# No encoding supplied: defaulting to UTF-8 ?

# R Options
options(stringsAsFactors=FALSE,
        citation_format="pandoc", 
        dplyr.summarise.inform=FALSE, 
        knitr.table.format="html",
        kableExtra_view_html=TRUE,
        future.globals.maxSize=2000000000, mc.cores=4, 
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

# Python3 needed for clustering, umap, other python packages
# Path to binary will be automatically found
# Set manually if it does not work
reticulate_python3_path = unname(Sys.which("python3"))
reticulate_python3_path = "C:/Program Files/Python38/python.exe"
Sys.setenv(RETICULATE_PYTHON=reticulate_python3_path)

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat

# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix, sctransform, glmGamPoi (optional for speed but only available for R 4.0)
# Tables: kableExtra, DT
# Plots: ggsci
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast, limma (for a more efficient implementation of the Wilcoxon Rank Sum Test according to Seurat)
# Functional enrichment: enrichR
# Other: sessioninfo, cerebroApp, sceasy, ROpenSci/bibtex, knitcitations

# Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source='fold-hide',      # by default collapse code blocks
                      dev=c('png', 'pdf'),           # create figures in png and pdf; the first device (png) will be used for HTML output
                      dev.args=list(png=list(type="cairo"),  # png: use cairo - works on cluster, supports anti-aliasing (more smooth)
                                    pdf=list(bg="white")),     # pdf: use cairo - works on cluster, supports anti-aliasing (more smooth)
                      dpi=96,                        # figure resolution
                      fig.retina=2                   # retina multiplier
)

# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
invisible(knitcitations::citep(citation("knitr")))

Dataset description

This section should contain a short description of the experiment and the data.

Project-specific parameters

This code chunk contains all parameters that are set specifically for the project.

param = list()

####################
# Input parameters #
####################
# Project ID
param$project_id = "pbmc"

# Path to input data
param$path_data = data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
                             type=c("10x","smartseq2"),
                             path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/", "test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"),  
                             stats=c(NA, NA))

# Downsample data to at most n cells per sample (mainly for tests)
#   NULL to deactivate
param$downsample_cells_n = NULL

# Path to output directory
param$path_out = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/"

# Marker genes based on literature, translated to Ensembl IDs
#   xlsx file, one list per column, first row as header and Ensembl IDs below
#   NULL if no known marker genes should be plotted
param$file_known_markers = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/known_markers.xlsx"

# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="entrezgene_accession")
param$mart_attributes = c(param$annot_main, 
                          c("chromosome_name", "start_position", "end_position", 
                            "percentage_gene_gc_content", "gene_biotype", "strand", "description"))
param$biomart_mirror = NULL

# Where to store the annotation so that it has to be fetched only one
# if NULL will be in output_directory/annotation
param$file_annot = NULL

# Prefix for mitochondrial genes 
param$mt = "^MT-"

#####################
# Filter parameters #
#####################
# Filter for cells
param$cell_filter = list(nFeature_RNA=c(200, NA), percent_mt=c(NA, 20))

# Filter for features
param$feature_filter = list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell

# Samples to drop
# Cells from these samples will be dropped after initial QC
# Example: param$samples_to_drop = c("pbmc_smartseq2_NC", "pbmc_smartseq2_RNA"), 
#   where "pbmc_smartseq2" is the name of the dataset, and "NC" and "RNA" are the names of the subsamples
param$samples_to_drop = c() 

# Drop samples with too few cells
param$samples_min_cells = 10

############################
# Normalisation parameters #
############################
# Which normalisation should be used for analysis?
#   "RNA" or "SCT"
param$norm = "RNA"

# Whether or not to remove cell cycle effects
param$cc_remove = FALSE

# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE

# Whether or not to re-score cell cycle effects after data
#   from different samples have been merged/integrated
param$cc_rescore_after_merge = TRUE

# Additional (unwanted) variables that will be regressed out for visualisation and clustering
param$vars_to_regress = c()

# When there are multiple datasets, how to combine them:
#   - method:
#     - "single": Default when there is only one dataset after filtering, no integration is needed
#     - "merge": Merge (in other words, concatenate) data when no integration is needed, 
#                       e.g. when samples were multiplexed on the same chip.
#     - "integrate": Anchors are computed for all pairs of datasets. This will give all datasets the same weight
#                       during dataset integration but can be computationally intensive
#
# Additional options for the "integrate" method:
#
#   - dimensions: Number of dimensions to consider for integration
#   - reference: Use one or more datasets as reference and compute anchors for all other datasets. Separate multiple datasets by comma
#                           This is computationally faster but less accurate.
#   - use_reciprocal_pca: Compute anchors in PCA space. Even more computationally faster but again less accurate 
#                       Recommended for big datasets.
#   - k.filter: How many neighbors to use when filtering anchors (default: min(200, minimum number of cells in a sample))
#   - k.weight: Number of neighbors to consider when weighting anchors (default: min(100, minimum number of cells in a sample))
#   - k.anchor: How many neighbors to use when picking anchors (default: min(5, minimum number of cells in a sample))
#   - k.score: How many neighbors to use when scoring anchors (default: min(30, minimum number of cells in a sample))
param$integrate_samples = list(method="integrate", dimensions=30, reference=NULL, use_reciprocal_pca=FALSE)

# TO DISCUSS from Seurat vignette:
# The results show that rpca-based integration is more conservative, and in this case, do not perfectly align a subset of cells (which are naive and memory T cells) across experiments. You can increase the strength of alignment by increasing the k.anchor parameter, which is set to 5 by default. Increasing this parameter to 20 will assist in aligning these populations.

# The number of PCs to use; adjust this parameter based on the Elbowplot 
param$pc_n = 10

# Resolution of clusters; low values will lead to fewer clusters of cells 
param$cluster_resolution=0.5

#######################################################
# Marker genes and genes with differential expression #
#######################################################
# Thresholds to define marker genes
param$marker_padj = 0.05
param$marker_log2FC = log2(2)
param$marker_pct = 0.25

# Additional (unwanted) variables to account for in statistical tests
param$latent_vars = c()

# Contrasts to find differentially expressed genes (R data.frame or Excel file)
# Required columns:
# condition_column: Categorial column in the cell metadata; specify "orig.ident" for sample and "seurat_clusters" for cluster
# condition_group1: Condition levels in group 1, multiple levels concatenated by the plus character
#                     Empty string = all levels not in group2 (cannot be used if group2 is empty)
# condition_group2: Condition levels in group 2, multiple levels concatenated by the plus character
#                     Empty string = all levels not in group1 (cannot be used if group1 is empty)
#
# Optional columns:
# subset_column: Categorial column in the cell metadata to subset before testing (default: NA)
#                  Specify "orig.ident" for sample and "seurat_clusters" for cluster 
# subset_group: Further subset levels (default: NA)
#                 For the individual analysis of multiple levels separate by semicolons
#                 For the joint analysis of multiple levels concatenate by the plus character 
#                 For the individual analysis of all levels empty string ""
# assay: Seurat assay to test on; can also be a Seurat dimensionality reduction (default: "RNA")
# slot: In case assay is a Seurat assay object, which slot to use (default: "data")
# padj: Maximum adjusted p-value (default: 0.05)
# log2FC: Minimum absolute log2 fold change (default: 0)
# min_pct: Minimum percentage of cells expressing a gene to test (default: 0.1)
# test: Type of test; "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST", "DESeq2"; (default: "wilcox")
# downsample_cells_n: Downsample each group to at most n cells to speed up tests (default: NA)
# latent_vars: Additional variables to account for; multiple variables need to be concatenated by semicolons; will overwrite the default by param$latent_vars (default: none).
param$deg_contrasts = data.frame(condition_column=c("orig.ident", "orig.ident", "Phase"),
                                 condition_group1=c("pbmc_10x", "pbmc_10x", "G1"),
                                 condition_group2=c("pbmc_smartseq2_sample1", "pbmc_smartseq2_sample1", "G2M"),
                                 subset_column=c(NA, "seurat_clusters", "seurat_clusters"),
                                 subset_group=c(NA, "", "1;2"),
                                 downsample_cells_n=c(NA, 50, 30),
                                 log2FC=log2(c(1.5, 1.5, 1.5)))

# P-value threshold for functional enrichment tests
param$enrichr_padj = 0.05

# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")

######################
# General parameters #
######################
# Main colour to use for plots
param$col = "palevioletred"

# Colour palette used for samples
param$col_palette_samples = "ggsci::pal_jama"

# Colour palette used for cluster
param$col_palette_clusters = "ggsci::pal_igv"

# Path to git repository
param$path_to_git = "."

# Debugging mode: 
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions 
param$debugging_mode = "default_debugging"
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_io.R",
                        "functions_plotting.R",
                        "functions_analysis.R",
                        "functions_degs.R",
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))

# Debugging mode: 
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions 
switch (param$debugging_mode, 
        default_debugging=on_error_default_debugging(), 
        terminal_debugger=on_error_start_terminal_debugger(),
        print_traceback=on_error_just_print_traceback(),
        on_error_default_debugging())

# Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

# Create output directories
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "marker_degs"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "annotation"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)
dir.create(file.path(param$path_out, "export"), recursive=TRUE, showWarnings=FALSE)

# Path for figures in png and pdf format (trailing '/' is needed)
knitr::opts_chunk$set(fig.path=paste0(file.path(param$path_out, "figures"), "/"))

# Path for annotation
if (is.null(param$file_annot)) param$file_annot = file.path(param$path_out, "annotation", paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))

# Do checks
error_messages = c()

# Check parameters and parse entries (e.g. numbers) so that they are valid
param = check_parameters_scrnaseq(param)
error_messages = c(error_messages, param[["error_messages"]])
# Check installed packages
error_messages = c(error_messages, check_installed_packages_scrnaseq())
# Check python
error_messages = c(error_messages, check_python())
# Check pandoc
error_messages = c(error_messages, check_pandoc())
# Check enrichR
error_messages = c(error_messages, check_enrichr(param$enrichr_dbs))
# Check ensembl
error_messages = c(error_messages, check_ensembl(biomart="ensembl", 
                                                 dataset=param$mart_dataset, 
                                                 mirror=param$biomart_mirror, 
                                                 version=param$annot_version,
                                                 attributes=param$mart_attributes,
                                                 file_annot=param$file_annot))

# Stop here if there are errors
if (length(error_messages)) stop(paste(c("", paste("*", error_messages)), collapse="\n"))

Read data

The workflow works with pre-processed 10x data, as well as pre-processed SmartSeq-2 and other data that are represented by a simple table for counts per gene and cell.

Pre-processing of 10x data with Cell Ranger

We use the 10x Genomics Cell Ranger software to process 10x sequencing reads. The result is a matrix with counts per feature (rows) and cell (columns). Features can be genes for measuring gene expression or antibodies for measuring protein levels. To save disk space, the matrix is converted into a condensed format and described by the following three files:

  • features.tsv.gz: A gzipped tabular file with information about the features (feature id, feature symbol and type)
  • barcodes.tsv.gz: A gzipped tabular file with the barcode for each cell, i.e. the information about the cells
  • matrix.mtx: A sparse version of the count matrix where the first column contains the row number of the feature from the feature table, the second column the row number of the cell from the barcodes table and the third column contains the count.
Cell Ranger also produces a file with mapping statistics, which is printed here.

Read and print mapping statistics

We begin by printing mapping statistics that have been produced prior to this workflow.

# Are statistics provided?
if (!all(is.na(param$path_data$stats))) { 
  
  # Loop through all samples and read mapping stats
  mapping_stats_list = list()
  for (i in 1:nrow(param$path_data)) {  
    if (!is.na(param$path_data$stats[i])) { 
      mapping_stats_list[[param$path_data$name[i]]] = read.delim(param$path_data$stats[i], 
                                                                 sep=",", header=FALSE, check.names=FALSE) %>%
        t() %>% as.data.frame()
    } 
  }
  
  # Join all mapping stats tables
  mapping_stats = mapping_stats_list %>% purrr::reduce(dplyr::full_join, by="V1")
  rownames(mapping_stats) = mapping_stats[["V1"]]
  mapping_stats = mapping_stats %>% dplyr::select(-V1)
  colnames(mapping_stats) = names(mapping_stats_list)
 
  # Print table to HTML 
  knitr::kable(mapping_stats, align="l", caption="Mapping statistics") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
 
} else { 
  message("Mapping statistics cannot be shown. No valid file provided.")
}

× (Message)
Mapping statistics cannot be shown. No valid file provided.

Read gene annotation

If not provided already, we read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat names.

# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
  annot_ensembl = read.delim(param$file_annot)
} else {
  annot_mart = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                           dataset=param$mart_dataset, 
                                           mirror=param$biomart_mirror, 
                                           version=param$annot_version))
  annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes, useCache=FALSE)
  write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
  message("Gene annotation file was created at: ", param$file_annot)
  # Note: depending on the attributes, there might be more than one row per gene
}

# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]

# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])

# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])

# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)

# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
#symbol_to_ensembl = unique(annot_ensembl[, c(ensembl, symbol)])
#symbol_to_ensembl = split(symbol_to_ensembl[, ensembl], symbol_to_ensembl[, symbol])

# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
#symbol_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
#symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname[, ensembl]]
#symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname[, symbol])

# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])

# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})

# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]
# Use biomart to translate human cell cycle genes to the species of interest and save them in a file
cc_genes_marker_file = file.path(param$path_out, "annotation", "cell_cycle_markers.xlsx")

if (file.exists(cc_genes_marker_file)) {
  # Load from file
  genes_s = openxlsx::read.xlsx(cc_genes_marker_file, sheet=1)
  genes_g2m = openxlsx::read.xlsx(cc_genes_marker_file, sheet=2)
  
} else { 
  # Obtain from Ensembl
  # Note: both mart objects must point to the same mirror for biomarT::getLDS to work
  mart_human = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                           dataset="hsapiens_gene_ensembl", 
                                           mirror=param$biomart_mirror, 
                                           version=param$annot_version))
  mart_myspecies = suppressWarnings(GetBiomaRt(biomart="ensembl", 
                                               dataset=param$mart_dataset, 
                                               mirror=GetBiomaRtMirror(mart_human), 
                                               version=param$annot_version)) 
  
  # S phase marker
  genes_s = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"), 
                          filters="external_gene_name", 
                          values=Seurat::cc.genes.updated.2019$s.genes, 
                          mart=mart_human, 
                          attributesL=c("ensembl_gene_id", "external_gene_name"), 
                          martL=mart_myspecies, 
                          uniqueRows=TRUE)
  colnames(genes_s) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
  
  # G2/M marker
  genes_g2m = biomaRt::getLDS(attributes=c("ensembl_gene_id", "external_gene_name"), 
                            filters="external_gene_name", 
                            values=Seurat::cc.genes.updated.2019$g2m.genes, 
                            mart=mart_human, 
                            attributesL=c("ensembl_gene_id", "external_gene_name"), 
                            martL=mart_myspecies, 
                            uniqueRows=TRUE)
  colnames(genes_g2m) = c("Human_ensembl_id", "Human_gene_name", "Species_ensembl_id", "Species_gene_name")
  
  # Write to file
  openxlsx::write.xlsx(list(S_phase=genes_s,G2M_phase=genes_g2m),file=cc_genes_marker_file)
}

# Convert Ensembl ID to Seurat-compatible unique rowname
genes_s = data.frame(Human_gene_name=genes_s$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_s$Species_ensembl_id]))
genes_g2m = data.frame(Human_gene_name=genes_g2m$Human_gene_name, Species_gene_name=unname(ensembl_to_seurat_rowname[genes_g2m$Species_ensembl_id]))

Read scRNA-seq data

We next read the scRNA-seq dataset(s) into Seurat.

# List of Seurat objects
sc = list()

datasets = param$path_data
for (i in seq(nrow(datasets))) {
  name = datasets[i,"name"]
  type = datasets[i,"type"]
  path = datasets[i,"path"]
  suffix = datasets[i,"suffix"]
  
  # Read 10X or smartseq2
  if (type == "10x") {
    # Read 10X sparse matrix into a Seurat object
    sc = c(sc, ReadSparseMatrix(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, cellnames_suffix=suffix))
    
  } else if (type == "smartseq2") {
    # Read counts table into a Seurat object
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE, cellnames_suffix=suffix))
  } 
}

# Make sure that sample names are unique. If not, just prefix with the dataset name. Also set orig.ident to this name.
sample_names = names(sc)
duplicated_sample_names_idx = which(sample_names %in% sample_names[duplicated(sample_names)])
for (i in duplicated_sample_names_idx) {
  sample_names[i] = paste(head(sc[[i]][["orig.dataset", drop=TRUE]], 1), sample_names[i], sep=".")
  sc[[i]][["orig.ident"]] = sample_names[i]
}

# Set up colors for samples and add them to the sc objects
sample_names = purrr::flatten_chr(purrr::map(sc, function(s) {
  nms = unique(as.character(s[[]][["orig.ident"]]))
  return(nms) 
}))
param$col_samples = GenerateColours(num_colours=length(sample_names), names=sample_names, palette=param$col_palette_samples, alphas=1)
sc = purrr::map(sc, ScAddLists, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

# Downsample cells if requested
if (param$downsample_cells_n > 0) {
  sc = purrr::map(sc, function(s) {
    cells = ScSampleCells(sc=s, n=param$downsample_cells_n, seed=1)
    return(subset(s, cells=cells))
  })
}

sc
## $pbmc_10x
## An object of class Seurat 
## 33694 features across 4033 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
## 
## $pbmc_smartseq2_sample1
## An object of class Seurat 
## 33694 features across 311 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)

The following first table shows metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), or the above mentioned number of unique genes detected (“nFeature_RNA”). The second table shows metadata (columns) of the first 5 genes (rows).

# Combine cell metadata of the Seurat objects into one big metadata
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s) return(s[[]])) %>% as.data.frame())

# Print cell metadata
knitr::kable(head(sc_cell_metadata), align="l", caption="Cell metadata, top 5 rows") %>%
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Cell metadata, top 5 rows
orig.ident nCount_RNA nFeature_RNA orig.dataset SampleName PlateNumber PlateRow PlateCol
PBMC1_10x_AAACCCACACTTGGGC-1 pbmc_10x 7552 2037 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCACAGGTGGAT-1 pbmc_10x 4773 1800 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCAGTGCTTATG-1 pbmc_10x 4430 1565 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCATCCGTAGTA-1 pbmc_10x 4512 1592 pbmc_10x NA NA NA NA
PBMC1_10x_AAACCCATCTTACACT-1 pbmc_10x 6663 1919 pbmc_10x NA NA NA NA
PBMC1_10x_AAACGAAGTCTAGTGT-1 pbmc_10x 143 89 pbmc_10x NA NA NA NA
# Print gene metadata
knitr::kable(head(sc[[1]][["RNA"]][[]], 5), align="l", caption="Feature metadata, top 5 rows (only first dataset shown)") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))  %>% 
  kableExtra::scroll_box(width="100%")
Feature metadata, top 5 rows (only first dataset shown)
feature_id feature_name feature_type
TSPAN6 ENSG00000000003 TSPAN6 Gene Expression
TNMD ENSG00000000005 TNMD Gene Expression
DPM1 ENSG00000000419 DPM1 Gene Expression
SCYL3 ENSG00000000457 SCYL3 Gene Expression
C1orf112 ENSG00000000460 C1orf112 Gene Expression

Pre-processing

Quality control

We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).

# If filters were specified globally (i.e. not by sample), this chunk will copy them for each sample such that downstream filtering can work by sample
param$cell_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$cell_filter)) {
    return(param$cell_filter[[s]])
  } else {
    return(param$cell_filter)
  }
})

param$feature_filter = purrr::map(list_names(sc), function(s) {
  if (s %in% names(param$feature_filter)) {
    return(param$feature_filter[[s]])
  } else {
    return(param$feature_filter)
  }
})
# Calculate percentage of counts in mitochondrial genes for each Seurat object
sc = purrr::map(sc, function(s) {
  mt_features = grep(pattern=param$mt, rownames(s), value=TRUE)
  return(Seurat::PercentageFeatureSet(s, features=mt_features, col.name="percent_mt", assay="RNA"))
})

# Calculate percentage of counts in ribosomal genes for each Seurat object
sc = purrr::map(sc, function(s) {
  ribo_features = grep(pattern="^RP[SL]", rownames(s), value=TRUE, ignore.case=TRUE)
  return(Seurat::PercentageFeatureSet(s, features=ribo_features, col.name="percent_ribo", assay="RNA"))
})

# Calculate percentage of counts in ERCC for each Seurat object (if assay is available)
sc = purrr::map(sc, function(s) {
  if ("ERCC" %in% Seurat::Assays(s)) s$percent_ercc = s$nCount_ERCC/(s$nCount_ERCC + s$nCount_RNA)*100
  return(s)
  })

# Combine (again) cell metadata of the Seurat objects into one big metadata, this time including mt and ercc 
sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ return(s[[]]) }) %>% as.data.frame())
# Only RNA assay at the moment
# counts_median uses sapply on the counts matrix, which converts the sparse matrix into a normal matrix
#   This might have to be adapted in future (Sparse Matrix Apply Function)
sc = purrr::map(list_names(sc), function(n) {
  # Calculate percentage of counts per gene in a cell
  counts_rna = Seurat::GetAssayData(sc[[n]], slot="counts", assay="RNA")
  total_counts = sc[[n]][["nCount_RNA", drop=TRUE]]
  counts_rna_perc = Matrix::t(Matrix::t(counts_rna)/total_counts)*100

  # Calculate feature filters
  num_cells_expr = Matrix::rowSums(counts_rna >= 1)
  num_cells_expr_threshold = Matrix::rowSums(counts_rna >= param$feature_filter[[n]][["min_counts"]])
  
  # Calculate median of counts_rna_perc per gene 
  counts_median = apply(counts_rna_perc, 1, median)
  
  # Add all QC measures as metadata
  sc[[n]][["RNA"]] = Seurat::AddMetaData(sc[[n]][["RNA"]], data.frame(num_cells_expr, num_cells_expr_threshold, counts_median))
  return(sc[[n]])
})
# Plot QC metrics for cells
cell_qc_features = c("nFeature_RNA", "nCount_RNA", "percent_mt")
if ("percent_ercc" %in% colnames(sc_cell_metadata)) cell_qc_features = c(cell_qc_features, "percent_ercc")
cell_qc_features = values_to_names(cell_qc_features)

p_list = list()
for (i in names(cell_qc_features)) {
  p_list[[i]]= ggplot(sc_cell_metadata[, c("orig.ident", i)], aes_string(x="orig.ident", y=i, fill="orig.ident", group="orig.ident")) +
    geom_violin(scale="width")

  # Adds points for samples with less than three cells since geom_violin does not work here
  p_list[[i]] = p_list[[i]] + 
    geom_point(data=sc_cell_metadata[, c("orig.ident", i)] %>% dplyr::filter(orig.ident %in% names(which(table(sc_cell_metadata$orig.ident) < 3))), aes_string(x="orig.ident", y=i, fill="orig.ident"), shape=21, size=2)
  
  # Now add styles
  p_list[[i]] = p_list[[i]] + 
    AddStyle(title=i, legend_position="none", fill=param$col_samples, xlab="") + 
    theme(axis.text.x=element_text(angle=45, hjust=1))
  
  # Creates a table with min/max values for filter i for each dataset
  cell_filter_for_plot = purrr::map_dfr(names(param$cell_filter), function(n) {
    # If filter i in cell filter of the dataset, then create dataframe with columns orig.ident, threshold and value
    if (i %in% names(param$cell_filter[[n]])){
      data.frame(orig.ident=n, threshold=c("min", "max"), value=param$cell_filter[[n]][[i]], stringsAsFactors=FALSE)
    } 
  })
  
  # Add filters as segments to plot
  if (nrow(cell_filter_for_plot) > 0) {
    # Remove entries that are NA
    cell_filter_for_plot = cell_filter_for_plot %>% dplyr::filter(!is.na(value))
    p_list[[i]] = p_list[[i]] + geom_segment(data=cell_filter_for_plot, 
                                             aes(x=as.integer(as.factor(orig.ident))-0.5, 
                                                 xend=as.integer(as.factor(orig.ident))+0.5, 
                                                 y=value, yend=value), 
                                             lty=2, col="firebrick")
  }
}
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Distribution of feature values") 
p

# Correlate QC metrics for cells
p_list = list()
p_list[[1]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[2], y=cell_qc_features[1], colour="orig.ident")) +
  geom_point() + 
  AddStyle(col=param$col_samples)
p_list[[2]] = ggplot(sc_cell_metadata, aes_string(x=cell_qc_features[3], y=cell_qc_features[1], colour="orig.ident")) +
  geom_point() + 
  AddStyle(col=param$col_samples)
p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Features plotted against each other")

if (length(sc)==1) {
  p = p & theme(legend.position="bottom")
} else {
  p = p + patchwork::plot_layout(guides="collect") & theme(legend.position="bottom")
}
p

Genes with highest expression

We next investigate whether there are individual genes that are represented by an unusually high number of counts. For each cell, we first calculate the percentage of counts per gene. Subsequently, for each gene, we calculate the median value of these percentages. Genes with the highest median percentage of counts are plotted below.

# Plot only samples that we intend to keep 
sc_names = names(sc)[!(names(sc) %in% param$samples_to_drop)]
genes_highestExpr = lapply(sc_names, function(i) {
  top_ten_exp = sc[[i]][["RNA"]][["counts_median"]] %>% dplyr::arrange(dplyr::desc(counts_median)) %>% head(n=10)
  return(rownames(top_ten_exp))
  }) %>%
  unlist() %>%
  unique()

genes_highestExpr_counts = purrr::map_dfc(sc[sc_names], .f=function(s) s[["RNA"]][["counts_median"]][genes_highestExpr, ]) 
genes_highestExpr_counts$gene = genes_highestExpr
genes_highestExpr_counts = genes_highestExpr_counts %>% tidyr::pivot_longer(cols=all_of(sc_names))
genes_highestExpr_counts$name = factor(genes_highestExpr_counts$name, levels=sc_names)

col =  GenerateColours(num_colours=length(genes_highestExpr), names=genes_highestExpr, palette="ggsci::pal_simpsons")
p = ggplot(genes_highestExpr_counts, aes(x=name, y=value, col=gene, group=gene)) + 
  geom_point() + 
  AddStyle(title="Top 10 highest expressed genes per sample, added into one list", 
           xlab="Sample", ylab="Median % of raw counts\n per gene in a cell", 
           legend_position="bottom", 
           col=col)
if (length(unique(genes_highestExpr_counts$name))>1) p = p + geom_line()
p

Filtering

Cells and genes are filtered based on the following thresholds:

cell_filter_lst = param$cell_filter %>% unlist(recursive=FALSE)
is_numeric_filter = purrr::map_lgl(cell_filter_lst, function(f) return(is.numeric(f) & length(f)==2))

# numeric cell filters
if (length(cell_filter_lst[is_numeric_filter]) > 0) {
  purrr::invoke(rbind, cell_filter_lst[is_numeric_filter]) %>%
    knitr::kable(align="l", caption="Numeric filters applied to cells", col.names=c("Min", "Max")) %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
Numeric filters applied to cells
Min Max
pbmc_10x.nFeature_RNA 200 NA
pbmc_10x.percent_mt NA 20
pbmc_smartseq2_sample1.nFeature_RNA 200 NA
pbmc_smartseq2_sample1.percent_mt NA 20
# categorial cell filters
if (length(cell_filter_lst[!is_numeric_filter]) > 0) {
purrr::invoke(rbind, cell_filter_lst[!is_numeric_filter] %>% purrr::map(paste, collapse=",")) %>%
  knitr::kable(align="l", caption="Categorial filters applied to cells", col.names=c("Values")) %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
  
# gene filters
feature_filter_lst = param$feature_filter %>% unlist(recursive=FALSE)
if (length(feature_filter_lst) > 0) {
  purrr::invoke(rbind, feature_filter_lst) %>% 
    knitr::kable(align="l", caption="Filters applied to genes", col.names=c("Value")) %>%
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
}
Filters applied to genes
Value
pbmc_10x.min_counts 1
pbmc_10x.min_cells 3
pbmc_smartseq2_sample1.min_counts 1
pbmc_smartseq2_sample1.min_cells 3

The number of excluded cells and features is as follows:

# Iterate over datasets and filters
# Record a cell if it does not pass the filter
# Also record a cell if it belongs to a sample that should be dropped
sc_cells_to_exclude  = purrr::map(list_names(sc), function(n) { 
  filter_result = purrr::map(list_names(param$cell_filter[[n]]), function(f) {
    filter = param$cell_filter[[n]][[f]]
    if (is.numeric(filter)) {
      if (is.na(filter[1])) filter[1] = -Inf # Minimum
      if (is.na(filter[2])) filter[2] = Inf  # Maximum 
      idx_exclude = sc[[n]][[f, drop=TRUE]] < filter[1] | sc[[n]][[f, drop=TRUE]] > filter[2]
      return(names(which(idx_exclude)))
    } else if (is.character(filter)) { 
      idx_exclude = !sc[[n]][[f, drop=TRUE]] %in% filter
      return(Cells(sc[[n]])[idx_exclude])
    }
  })

  # Samples to drop
  if (n %in% param$samples_to_drop) {
    filter_result[["samples_to_drop"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_to_drop"]] = as.character(c())
  }
  
  # Minimum number of cells for a sample to keep
  if (ncol(sc[[n]]) < param$samples_min_cells) {
    filter_result[["samples_min_cells"]] = colnames(sc[[n]])
  } else {
    filter_result[["samples_min_cells"]] = as.character(c())
  }
  
  return(filter_result)
})

# Summarise
sc_cells_to_exclude_summary = purrr::map_dfr(sc_cells_to_exclude, function(s) {
  return(as.data.frame(purrr::map(s, length))) 
  })
rownames(sc_cells_to_exclude_summary) = names(sc_cells_to_exclude)
knitr::kable(sc_cells_to_exclude_summary, align="l", caption="Number of excluded cells") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) 
Number of excluded cells
nFeature_RNA percent_mt samples_to_drop samples_min_cells
pbmc_10x 286 271 0 0
pbmc_smartseq2_sample1 1 0 0 0
# Now filter, drop the respective colours and adjust integration method
sc = purrr::map(list_names(sc), function(n) {
  cells_to_keep = Cells(sc[[n]])
  cells_to_keep = cells_to_keep[!cells_to_keep %in% purrr::flatten_chr(sc_cells_to_exclude[[n]])]
  if (length(cells_to_keep)==0) return(NULL)
  else return(subset(sc[[n]], cells=cells_to_keep))
}) %>% purrr::discard(is.null)

if (length(sc)==1) param$integrate_samples[["method"]] = "single"
# Only RNA assay at the moment

# Iterate over datasets and record a feature if it does not pass the filter
sc_features_to_exclude = purrr::map(list_names(sc), function(n) {
  if (length(Cells(sc[[n]])) < param$feature_filter[[n]][["min_cells"]]) return(list())
  else return(names(which(sc[[n]][["RNA"]][["num_cells_expr_threshold", drop=TRUE]] < param$feature_filter[[n]][["min_cells"]])))
})

# Summarise
sc_features_to_exclude_summary = purrr::map(sc_features_to_exclude, length) %>% 
  t() %>% as.data.frame() 
rownames(sc_features_to_exclude_summary) = c("Genes")
knitr::kable(sc_features_to_exclude_summary, align="l", caption="Number of excluded genes") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Number of excluded genes
pbmc_10x pbmc_smartseq2_sample1
Genes 13893 15754
# Now filter
sc = purrr::map(list_names(sc), function(n) {
  assay_names = Seurat::Assays(sc[[n]])
  features_to_keep = purrr::map(values_to_names(assay_names), function(a) {
    features = rownames(sc[[n]][[a]])
    keep = features[!features %in% sc_features_to_exclude[[n]]]
    return(keep)
  })
  return(subset(sc[[n]], features=purrr::flatten_chr(features_to_keep)))
})

After filtering, the size of the Seurat object is:

sc
## $pbmc_10x
## An object of class Seurat 
## 19801 features across 3608 samples within 1 assay 
## Active assay: RNA (19801 features, 0 variable features)
## 
## $pbmc_smartseq2_sample1
## An object of class Seurat 
## 17940 features across 310 samples within 1 assay 
## Active assay: RNA (17940 features, 0 variable features)

Normalisation

In this section, we subsequently run a series of Seurat functions for each provided sample:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. We assign cell cycle scores to each cell based on its normalised expression of G2/M and S phase markers. These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 3).
3. Dependent on the normalisation of your choice, we either
3a. Run standard functions to select variable genes, and scale normalised gene counts. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly in others. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score).
3b. Run SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling).

Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. As an alternative, we can remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat “Satija Lab” (2020). Cell cycle effects removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.

While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.

# Normalise data the original way
#   This is required to score cell cycle (https://github.com/satijalab/seurat/issues/1679)
sc = purrr::map(sc, Seurat::NormalizeData, normalization.method="LogNormalize", scale.factor=10000, verbose=FALSE)
# Determine cell cycle effect per sample 
sc = purrr::map(list_names(sc), function(n) {
  sc[[n]] = CCScoring(sc=sc[[n]], genes_s=genes_s[,2], genes_g2m=genes_g2m[,2], name=n)
  if (any(is.na(sc[[n]][["S.Score"]])) | any(is.na(sc[[n]][["G2M.Score"]]))) {
    param$cc_remove=FALSE
    param$cc_remove_all=FALSE
    param$cc_rescore_after_merge=FALSE
  }
  return(sc[[n]])
})

# If cell cycle effects should be removed, we first score cells 
# The effect is then removed in the following chunk 
if (param$cc_remove) {
# Add to vars that need to regressed out during normalisation
  if (param$cc_remove_all) {
    # Remove all signal associated to cell cycle
    param$vars_to_regress = unique(c(param$vars_to_regress, "S.Score", "G2M.Score"))
    param$latent_vars = unique(c(param$latent_vars, "S.Score", "G2M.Score"))
  } else {
    # Don't remove the difference between cycling and non-cycling cells 
    param$vars_to_regress = unique(c(param$vars_to_regress, "CC.Difference"))
    param$latent_vars = unique(c(param$latent_vars, "CC.Difference"))
  }  
}
if (param$norm == "RNA") { 
  # Find variable features from normalised data (unaffected by scaling)
  sc = purrr::map(sc, Seurat::FindVariableFeatures, selection.method="vst", nfeatures=3000, verbose=FALSE)
  
  # Scale 
  # Note: For a single dataset where no integration/merging is needed, all features can already be scaled here. 
  #   Otherwise, scaling of all features will be done after integration/merging.
  if (param$integrate_samples[["method"]]=="single") {
    sc[[1]] = Seurat::ScaleData(sc[[1]], 
                      features=rownames(sc[[1]][["RNA"]]),
                      vars.to.regress=param$vars_to_regress, 
                      verbose=FALSE) 
  }
} else if (param$norm == "SCT") {
  # Run SCTransform
  #
  # This is a new normalisation method that replaces previous Seurat functions 'NormalizeData', 'FindVariableFeatures', and 'ScaleData'. 
  # vignette: https://satijalab.org/seurat/v3.0/sctransform_vignette.html
  # paper: https://www.biorxiv.org/content/10.1101/576827v2
  # Normalised data end up here: sc@assays$SCT@data
  # Note: For a single dataset where no integration is needed, all features can already be scaled here. 
  #   Otherwise, it is enough to scale only the variable features.
  # Note: It is not guaranteed that all genes are successfully normalised with SCTransform. 
  #   Consequently, some genes might be missing from the SCT assay. 
  #   See: https://github.com/ChristophH/sctransform/issues/27
  # Note: The performance of SCTransform can be improved by using 'glmGamPoi' instead of 'poisson' as method for initial parameter estimation.
  sc = purrr::map(list_names(sc), function(n) { 
    SCTransform(sc[[n]], 
                vars.to.regress=param$vars_to_regress, 
                min_cells=param$feature_filter[[n]][["min_cells"]], 
                verbose=FALSE, 
                return.only.var.genes=!(param$integrate_samples[["method"]]=="single"),
                method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson")) 
  })
}

Variable genes

Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells.

To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019)). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability. The top 3,000 variable genes are used for further analysis.

fig_height_vf = 5 * ceiling(length(names(sc))/2)
p_list = purrr::map(list_names(sc), function(n) {
  top10 = head(Seurat::VariableFeatures(sc[[n]], assay=param$norm), 10)
  p = Seurat::VariableFeaturePlot(sc[[n]], 
                                  assay=param$norm, 
                                  selection.method=ifelse(param$norm=="RNA", "vst", "sct"), 
                                  col=c("grey", param$col)) + 
    AddStyle(title=n) + 
    theme(legend.position=c(0.2, 0.8), legend.background=element_rect(fill=alpha("white", 0.0)))
  p = LabelPoints(plot=p, points=top10, repel=TRUE, xnudge=0, ynudge=0)
  return(p)
})

p = patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Variable genes")
p

Combining multiple samples

if (param$integrate_samples[["method"]]!="single") {
  
  # When merging, feature meta-data is removed by Seurat entirely; save separately for each assay except for SCT and add again afterwards
  # Note: not sure whether this is still needed - discuss
  assay_names = setdiff(unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } ))), "SCT")

  # Loop through all assays and accumulate meta data
  sc_feature_metadata = purrr::map(values_to_names(assay_names), function(a) {
    # "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
    # This step is skipped for assays that do not contain all three types of feature information
    contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) { 
      all(c("feature_id", "feature_name", "feature_type") %in% colnames(sc[[n]][[a]][[]])) 
      })

    if (all(contains_neccessary_columns)) {
      feature_id_name_type = purrr::map(sc, function(s) return(s[[a]][[c("feature_id", "feature_name", "feature_type")]]) )
      feature_id_name_type = purrr::reduce(feature_id_name_type, function(df_x, df_y) {
        new_rows = which(!rownames(df_y) %in% rownames(df_x))
        if (length(new_rows) > 0) return(rbind(df_x, df_y[new_rows, ]))
        else return(df_x)
      })
      feature_id_name_type$row_names = rownames(feature_id_name_type)
    } else {
      feature_id_name_type = NULL
    }
    
    # For all other meta-data, we prefix column names with the dataset
    other_feature_data = purrr::map(list_names(sc), function(n) {
      df = sc[[n]][[a]][[]]
      if (contains_neccessary_columns[[n]]) df = df %>% dplyr::select(-dplyr::one_of(c("feature_id", "feature_name", "feature_type"), c()))
      if (ncol(df) > 0) colnames(df) = paste(n, colnames(df), sep=".")
      df$row_names = rownames(df)
      return(df)
    })
    
    # Now join everything by row_names by full outer join
    if (!is.null(feature_id_name_type)) {
      feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type), other_feature_data), dplyr::full_join, by="row_names")
    } else {
      feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
    }
    rownames(feature_data) = feature_data$row_names
    feature_data$row_names = NULL
    
    return(feature_data)
  })
  
  # When merging, cell meta-data are merged but factors are not kept
  sc_cell_metadata = suppressWarnings(purrr::map_dfr(sc, function(s){ s[[]] }) %>% as.data.frame())
  sc_cell_metadata_factor_levels = purrr::map(which(sapply(sc_cell_metadata, is.factor)), function(n) {
    return(levels(sc_cell_metadata[, n, drop=TRUE]))
  })
}
# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="integrate") {
  if (!"use_reciprocal_pca" %in% names(param$integrate_samples)) param$integrate_samples[["use_reciprocal_pca"]] = FALSE

  # Run integration
  sc = RunIntegration(sc, 
                      assay=param$norm,
                      ndims=param$integrate_samples[["dimensions"]], 
                      verbose=FALSE, 
                      reference=param$integrate_samples[["reference"]], 
                      use_reciprocal_pca=param$integrate_samples[["use_reciprocal_pca"]], 
                      k_filter=param$integrate_samples[["k.filter"]], 
                      k_weight=param$integrate_samples[["k.weight"]], 
                      k_anchor=param$integrate_samples[["k.anchor"]],
                      k_score=param$integrate_samples[["k.score"]])

  # Re-score cell cycle effects after integration
  if (param$cc_rescore_after_merge) {
    sc = CCScoring(sc=sc, genes_s=genes_s[,2], genes_g2m=genes_g2m[,2])
    if (any(is.na(sc[["S.Score"]])) | any(is.na(sc[["G2M.Score"]])))  {
      param$cc_remove=FALSE
      param$cc_remove_all=FALSE
      param$cc_rescore_after_merge=FALSE
      param$vars_to_regress = setdiff(param$vars_to_regress, c("S.Score", "G2M.Score", "CC.Difference"))
      param$latent_vars = setdiff(param$latent_vars, c("S.Score", "G2M.Score", "CC.Difference"))
    }
  }
  
  # (Re-)Run scaling
  if (param$norm == "RNA") {
    # According to Seurat, we need to scale data again for "RNAintegrated", and "RNA"
    DefaultAssay(sc) = "RNAintegrated"
    sc = Seurat::ScaleData(sc, 
                                features=rownames(sc[["RNAintegrated"]]), 
                                vars.to.regress=param$vars_to_regress, 
                                assay="RNAintegrated",
                                verbose=FALSE)
      
    DefaultAssay(sc) = "RNA"
    sc = Seurat::ScaleData(sc, 
                                features=rownames(sc[["RNA"]]), 
                                vars.to.regress=param$vars_to_regress, 
                                assay="RNA",
                                verbose=FALSE)
  } else if (param$norm == "SCT") {
    # We need to re-run SCTransform for the "SCT" assay again, to normalise on the complete dataset
    DefaultAssay(sc) = "SCT"
    min_cells_overall = max(purrr::map_int(param$feature_filter, function(f) as.integer(f[["min_cells"]])))
    sc = SCTransform(sc, vars.to.regress=param$vars_to_regress, 
                     min_cells=min_cells_overall,
                     verbose=FALSE,
                     return.only.var.genes=FALSE,
                     method=ifelse(packages_installed("glmGamPoi"), "glmGamPoi", "poisson"))
  }
  
  # Add feature metadata
  for (a in Seurat::Assays(sc)) {
    if (a %in% names(sc_feature_metadata)) {
      sc[[a]] = Seurat::AddMetaData(sc[[a]], sc_feature_metadata[[a]][rownames(sc[[a]]),, drop=FALSE])
    }
  }
  
  # Fix cell metadata factors
  for (f in names(sc_cell_metadata_factor_levels)) {
    sc[[f]] = factor(sc[[f, drop=TRUE]], levels=sc_cell_metadata_factor_levels[[f]])
  }
  
  # Add sample colours again to misc slot
  sc = ScAddLists(sc, lists=list(orig.ident=param$col_samples), lists_slot="colour_lists")

  # Set default assay (will be the integrated version)
  DefaultAssay(sc) = paste0(param$norm, "integrated")  
  
  message("Data values for all samples have been integrated.")
  print(sc)
}

× (Message)
Data values for all samples have been integrated.

## An object of class Seurat 
## 24312 features across 3918 samples within 2 assays 
## Active assay: RNAintegrated (3000 features, 3000 variable features)
##  1 other assay present: RNA

Relative log expression

n_cells_rle_plot = 100

# Sample at most 100 cells per dataset and save their identity
cells_subset = sc[["orig.ident"]] %>% tibble::rownames_to_column() %>% 
  dplyr::group_by(orig.ident) %>% 
  dplyr::sample_n(size=min(n_cells_rle_plot, length(orig.ident))) %>% 
  dplyr::select(rowname, orig.ident)

To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.

For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#374E55FF, #DF8F44FF)

Raw counts

# Plot raw data
p = PlotRLE(as.matrix(log2(GetAssayData(subset(sc, cells=cells_subset$rowname), assay="RNA", slot="counts") + 1)), 
            id=cells_subset$orig.ident, 
            col=param$col_samples) + 
  labs(title="log2(raw counts + 1)")
p

Normalised data

Dependent on the context, this tab refers to different data:

  • Single sample: RNA normalisation of the single sample
  • Multiple samples that were merged: Combined RNA normalisation post merging of all samples
  • Multiple samples that were integrated: Separate RNA normalisation prior to integration of all samples
# Plot normalised data
p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=param$norm, slot="data")), 
            id=cells_subset$orig.ident, 
            col=param$col_samples) + 
  labs(title="Normalised data")
p

Integrated data

if (! param$integrate_samples[["method"]] %in% c("single", "merge")) {
  # Plot integrated data
  p = PlotRLE(as.matrix(GetAssayData(subset(sc, cells=cells_subset$rowname), assay=paste0(param$norm, "integrated"), slot="data")), 
              id=cells_subset$orig.ident, 
              col=param$col_samples) + 
    labs(title="Integrated data")
  p
} else {
  message("No integrated data available.")
}

Dimensionality reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.

We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.

To decide how many PCs to include in downstream analyses, we visualise the cells and genes that define the PCA.

# Run PCA for default normalisation
sc = Seurat::RunPCA(sc, features=Seurat::VariableFeatures(object=sc), verbose=FALSE, npcs=min(50, ncol(sc)))
p_list = Seurat::VizDimLoadings(sc, dims=1:2, reduction="pca", col=param$col, combine=FALSE, balanced=TRUE)
for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
p =  patchwork::wrap_plots(p_list, ncol=2) + patchwork::plot_annotation("Top gene loadings of the first two PCs") 
p

p = Seurat::DimPlot(sc, reduction="pca", cols=param$col_samples) + 
  AddStyle(title="Cells arranged by the first two PCs", legend_position="bottom")
p

p_list = Seurat::DimHeatmap(sc, dims=1:min(20, ncol(sc)), cells=min(500, ncol(sc)), balanced=TRUE, fast=FALSE, combine=FALSE)
p_list = purrr::map(seq(p_list), function(i) {
  p_list[[i]] = p_list[[i]] + 
    ggtitle(paste("PC", i)) + 
    theme(legend.position="none", axis.text.y=element_text(size=8))
  return(p_list[[i]])
  })
p = patchwork::wrap_plots(p_list, ncol=3) + patchwork::plot_annotation("Top gene loadings of the first PCs")
p

Dimensionality of the dataset

We next need to decide how many PCs we want to use for our analyses. The following “Elbow plot” is designed to help us make an informed decision. PCs are ranked based on the percentage of variance they explain.

# More approximate technique used to reduce computation time
p = Seurat::ElbowPlot(sc, ndims=min(20, ncol(sc))) + 
  geom_vline(xintercept=param$pc_n + .5, col="firebrick", lty=2) + 
  AddStyle(title="Elbow plot") 
p

# Cannot have more PCs than number of cells
param$pc_n = min(param$pc_n, ncol(sc))

For the current dataset, 10 PCs were chosen.

Clustering

Seurat’s clustering method first constructs a graph structure, where nodes are cells and edges are drawn between cells with similar gene expression patterns. Technically speaking, Seurat first constructs a K-nearest neighbor (KNN) graph based on Euclidean distance in PCA space, and refines edge weights between cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To partition the graph into highly interconnected parts, cells are iteratively grouped together using the Leiden algorithm (Traag, Waltman, and van Eck 2019).

# Construct phylogenetic tree relating the 'average' cell from each sample
if (length(levels(sc$orig.ident)) > 1) {
  sc = suppressWarnings(Seurat::BuildClusterTree(sc, features=rownames(sc), verbose=FALSE))
  Seurat::Misc(sc, "trees") = list(orig.ident = Seurat::Tool(sc, "Seurat::BuildClusterTree"))
}

# The number of clusters can be optimized by tuning 'resolution' -> based on feedback from the client whether or not clusters make sense
# Choose the number of PCs to use for clustering
sc = Seurat::FindNeighbors(sc, dims=1:param$pc_n, verbose=FALSE)

# Seurat vignette suggests resolution parameter between 0.4-1.2 for datasets of about 3k cells
sc = Seurat::FindClusters(sc, resolution=param$cluster_resolution, algorithm=4, verbose=FALSE, method="igraph")

# Construct phylogenetic tree relating the 'average' cell from each cluster
if (length(levels(sc$seurat_clusters)) > 1) {
  sc = suppressWarnings(Seurat::BuildClusterTree(sc, dims=1:param$pc_n, verbose=FALSE))
  suppressWarnings({Seurat::Misc(sc, "trees") = c(Seurat::Misc(sc, "trees"), list(seurat_clusters = Seurat::Tool(sc, "Seurat::BuildClusterTree")))})
}

# Set up colors for clusters and add the sc object
param$col_clusters = GenerateColours(num_colours=length(levels(sc$seurat_clusters)), names=levels(sc$seurat_clusters), 
                                     palette=param$col_palette_clusters, alphas=1)
sc = ScAddLists(sc, lists=list(seurat_clusters=param$col_clusters), lists_slot="colour_lists")

Visualisation with UMAP

We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.

Take care not to mis-read a UMAP:

  • Parameters influence the plot (we use defaults here)
  • Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
  • Distances between clusters might not mean anything
  • You may need more than one plot

For a nice read to intuitively understand UMAP, see “Understanding UMAP” (2019).

Coloured by cluster

# Default UMAP
sc = suppressWarnings(Seurat::RunUMAP(sc, dims=1:param$pc_n, verbose=FALSE, umap.method="uwot"))
# Note that you can set `label = TRUE` or use the LabelClusters function to help label individual clusters
cluster_cells = table(sc@active.ident)
cluster_labels = paste0(levels(sc@active.ident)," (", cluster_cells[levels(sc@active.ident)],")")
p = Seurat::DimPlot(sc, reduction="umap", label=TRUE, label.box=TRUE) + 
  AddStyle(title="UMAP, cells coloured by cluster identity", legend_position="bottom", legend_title="Clusters") + 
  scale_color_manual(values=param$col_clusters, labels=cluster_labels) +
  scale_fill_manual(values=param$col_clusters, labels=cluster_labels) +
  theme(legend.position="none")

× (Message)
Scale for ‘fill’ is already present. Adding another scale for ‘fill,’ which
will replace the existing scale.

p

Coloured by sample

# Add a UMAP that is coloured by sample of origin
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% unique() %>% sort()

# Note: This is a hack to colour by sample but label by Cluster
p = Seurat::DimPlot(sc, reduction="umap", group.by="orig.ident", cols=param$col_samples) +
  AddStyle(title="UMAP, cells coloured by sample of origin", legend_position="bottom")
p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="lightgrey")
p

Distribution of cells in clusters

# Count cells per cluster per sample 
cell_samples = sc[[]] %>% dplyr::pull(orig.ident) %>% levels()
cell_clusters = sc[[]] %>% dplyr::pull(seurat_clusters) %>% levels()

tbl = dplyr::count(sc[[c("orig.ident", "seurat_clusters")]], orig.ident, seurat_clusters) %>% tidyr::pivot_wider(names_from="seurat_clusters", names_prefix="Cl_", values_from=n) %>% as.data.frame()
rownames(tbl) = paste0(tbl[,"orig.ident"],"_n")
tbl[,"orig.ident"] = NULL

# Add percentages
tbl_perc = round(t(tbl) / colSums(tbl) * 100, 2) %>% t()
rownames(tbl_perc) = gsub(rownames(tbl_perc), pattern="_n$", replacement="_perc", perl=TRUE)
tbl = rbind(tbl, tbl_perc)

# Add enrichment
if (length(cell_samples) > 1 & length(cell_clusters)>1) tbl = rbind(tbl, CellsFisher(sc))

# Sort
tbl = tbl[order(rownames(tbl)),, drop=FALSE]

# Plot percentages
tbl_bar = tbl[paste0(cell_samples, "_perc"), , drop=FALSE] %>% 
  tibble::rownames_to_column(var="Sample") %>%
  tidyr::pivot_longer(tidyr::starts_with("Cl"), names_to="Cluster", values_to="Percentage")
tbl_bar$Cluster = tbl_bar$Cluster %>% gsub(pattern="^Cl_", replacement="", perl=TRUE) %>% factor(levels=sc$seurat_clusters %>% levels())
tbl_bar$Sample = tbl_bar$Sample %>% gsub(pattern="_perc$", replacement="", perl=TRUE) %>% as.factor()
tbl_bar$Percentage = as.numeric(tbl_bar$Percentage)
p = ggplot(tbl_bar, aes(x=Cluster, y=Percentage, fill=Sample)) + 
  geom_bar(stat="identity" ) +
  AddStyle(title="Percentage cells of samples in clusters",
           fill=param$col_samples,
           legend_title="Sample",
           legend_position="bottom")
p

The following table shows the number of cells per sample per cluster:

  • n: Number of cells per sample per cluster
  • perc: Percentage of cells per sample per cluster compared to all other cells of that cluster

In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher or lower than expected:

  • oddsRatio: Odds ratio calculated for cluster c1 and sample s1 as (# cells s1 in c1 / # cells not s1 in c1) / (# cells s1 not in c1 / # cells not s1 not in c1)
  • p: P-value calculated with a Fisher test to test whether “n” is higher or lower than expected
# Print table
knitr::kable(tbl, align="l", caption="Number of cells per cluster per sample") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%")
Number of cells per cluster per sample
Cl_1 Cl_2 Cl_3 Cl_4 Cl_5 Cl_6 Cl_7 Cl_8 Cl_9 Cl_10
pbmc_10x.oddsRatio 1 1.51 1.19 0.73 1.53 0.71 1.16 1.32 0.25 0.55
pbmc_10x.p 5.4e-01 1.3e-02 1.7e-01 9.8e-01 2.8e-02 9.8e-01 3.2e-01 2.9e-01 1.0e+00 9.4e-01
pbmc_10x_n 721 582 569 493 410 369 254 122 49 39
pbmc_10x_perc 92.08 94.33 93.13 89.96 94.47 89.56 93.04 93.85 75.38 86.67
pbmc_smartseq2_sample1.oddsRatio 1 0.66 0.84 1.36 0.65 1.41 0.86 0.76 3.95 1.81
pbmc_smartseq2_sample1.p 5.2e-01 9.9e-01 8.7e-01 3.1e-02 9.8e-01 3.1e-02 7.6e-01 8.2e-01 2.9e-05 1.4e-01
pbmc_smartseq2_sample1_n 62 35 42 55 24 43 19 8 16 6
pbmc_smartseq2_sample1_perc 7.92 5.67 6.87 10.04 5.53 10.44 6.96 6.15 24.62 13.33
# Reset default assay, so we won't plot integrated data
# Note: We need integrated data for UMAP, clusters
DefaultAssay(sc) = param$norm

Cell Cycle Effect

How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.

# Set up colours for cell cycle effect and add to sc object
col =  GenerateColours(num_colours=length(levels(sc$Phase)), names=levels(sc$Phase), palette="ggsci::pal_npg", alphas=1)
sc = ScAddLists(sc, lists=list(Phase=col), lists_slot="colour_lists")

# Get a feeling for how many cells are affected
p1 = ggplot(sc[[]], aes(x=S.Score, y=G2M.Score, colour=Phase)) + 
  geom_point() + 
  scale_x_continuous("G1/S score") + 
  scale_y_continuous("G2/M score") + 
  AddStyle(col=Misc(sc, "colour_lists")[["Phase"]])

p2 = ggplot(sc@meta.data %>% 
              dplyr::group_by(seurat_clusters, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=seurat_clusters, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_x_discrete("Seurat clusters") + 
  scale_y_continuous("Fraction of cells") + 
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]])

p3 = ggplot(sc[[]] %>% 
              dplyr::group_by(orig.ident, Phase) %>% 
              dplyr::summarise(num_cells=length(Phase)), 
            aes(x=orig.ident, y=num_cells, fill=Phase)) + 
  geom_bar(stat="identity", position="fill") + 
  scale_y_continuous("Fraction of cells") +
  AddStyle(fill=Misc(sc, "colour_lists")[["Phase"]]) + 
  theme(axis.text.x = element_text(angle=30, hjust=1, vjust=1)) + xlab("")

p = p1 + p2 + p3 & theme(legend.position="bottom")
p = p + patchwork::plot_annotation(title="Cell cycle phases") + plot_layout(guides="collect")
p

UMAP coloured by cell cycle phases

if (any(!is.na(sc$Phase))) {
  # UMAP with phases superimposed
  # Note: This is a hack to colour by phase but label by Cluster
  p = Seurat::DimPlot(sc, group.by="Phase", pt.size=1, cols=Misc(sc, "colour_lists")[["Phase"]]) + 
    AddStyle(title="UMAP, cells coloured by cell cycle phases", legend_title="Phase")
  p$data$seurat_clusters = sc[["seurat_clusters"]][rownames(p$data), ]
  p = LabelClusters(p, id="seurat_clusters", box=TRUE, segment.color="black", fill="lightgrey")
  p
}

UMAP coloured by S phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="S.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
    AddStyle(title="UMAP, cells coloured by S phase")
  p = LabelClusters(p, id="ident", box=TRUE, segment.color="black", fill="lightgrey")
  p
}

UMAP coloured by G2/M phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="G2M.Score", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) + 
    AddStyle(title="UMAP, cells coloured by G2M phase")
  p = LabelClusters(p, id="ident", box=TRUE, segment.color="black", fill="lightgrey")
  p
}

UMAP coloured by the difference between S and G2/M phase

if (any(!is.na(sc$Phase))) {
  p = Seurat::FeaturePlot(sc, features="CC.Difference", pt.size=1, min.cutoff="q1", max.cutoff="q99", cols=c("lightgrey", param$col)) +
    AddStyle(title="UMAP, cells coloured by CC.Difference")
  p = LabelClusters(p, id="ident", box=TRUE, segment.color="black", fill="lightgrey")
  p
}

Cluster QC

Do cells in individual clusters have particularly high counts, detected genes or mitochondrial content?

Number of counts

p1 = suppressMessages(Seurat::FeaturePlot(sc, features="nCount_RNA") + 
  AddStyle(title="Feature plot") + 
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=TRUE, segment.color="black", fill="lightgrey")


p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=nCount_RNA, fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot (log10 scale)", fill=param$col_clusters,
           xlab="Cluster", legend_position="none") + 
  scale_y_log10()

p = p1 | p2 
p = p + patchwork::plot_annotation(title="Summed raw counts (nCount_RNA, log10 scale)")
p

Number of features

p1 = suppressMessages(Seurat::FeaturePlot(sc, features="nFeature_RNA") + 
  AddStyle(title="Feature plot") + 
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10"))
p1 = LabelClusters(p1, id="ident", box=TRUE, segment.color="black", fill="lightgrey")

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=nFeature_RNA, fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none") + 
  scale_y_log10()

p = p1 | p2 
p = p + patchwork::plot_annotation(title="Number of features with raw count > 0 (nFeature_RNA, log10 scale)")
p

Percent mitochondrial reads

p1 = Seurat::FeaturePlot(sc, features="percent_mt", cols=c("lightgrey", param$col)) + 
  AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=TRUE, segment.color="black", fill="lightgrey")

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_mt, fill=seurat_clusters, group=seurat_clusters)) + 
  geom_violin(scale="width") + 
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none")
p = p1 | p2 
p = p + patchwork::plot_annotation(title="Percent of mitochondrial features (percent_mt)")
p

Percent ribosomal reads

p1 = Seurat::FeaturePlot(sc, features="percent_ribo", cols=c("lightgrey", param$col)) +
  AddStyle(title="Feature plot")
p1 = LabelClusters(p1, id="ident", box=TRUE, segment.color="black", fill="lightgrey")

p2 = ggplot(sc[[]], aes(x=seurat_clusters, y=percent_ribo, fill=seurat_clusters, group=seurat_clusters)) +
  geom_violin(scale="width") +
  AddStyle(title="Violin plot", fill=param$col_clusters,
           xlab="Cluster", legend_position="none")
p = p1 | p2
p = p + patchwork::plot_annotation(title="Percent of ribosomal features (percent_ribo)")
p

Known marker genes

Do cells in individual clusters express provided known marker genes?

known_markers_list=c()

# Overwrite empty list of known markers 
if (!is.null(param$file_known_markers)) {
  # Read known marker genes and map to rownames
  known_markers = openxlsx::read.xlsx(param$file_known_markers)
  known_markers_list = lapply(colnames(known_markers), function(x) {
    y = ensembl_to_seurat_rowname[known_markers[,x]] %>% 
      na.exclude() %>% unique() %>% sort()
    m = !y %in% rownames(sc)
    if (any(m)){
      Warning(paste0("The following genes of marker list '", x, "' cannot be found in the data: ", first_n_elements_to_string(y[m], n=10)))
    }
    return(y[!m])
  })
  
  # Remove empty lists
  names(known_markers_list) = colnames(known_markers)
  is_empty = purrr::map_int(known_markers_list, .f=length) == 0 
  known_markers_list = known_markers_list[!is_empty]
  
  # Add lists to sc object
  sc = ScAddLists(sc, lists=setNames(known_markers_list, paste0("known_marker_", names(known_markers_list))), lists_slot="gene_lists")
}  

# Set plot options
if(length(known_markers_list) > 0) { 
  known_markers_n = length(known_markers_list) 
  known_markers_vect = unlist(known_markers_list) %>% unique() %>% sort()
  idx_dotplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) <= 50)
  idx_avgplot = sapply(seq(known_markers_list), function(x) length(known_markers_list[[x]]) >= 10)
} else { 
  known_markers_n=0
  idx_dotplot = idx_avgplot = FALSE
  known_markers_vect = c()
}
# Dotplots and average feature plots
# The height of 1 row (= 1 plot) is fixed to 5 
fig_height_knownMarkers_dotplot = max(5, 5 * sum(idx_dotplot))
fig_height_knownMarkers_avgplot = max(5, 5 * sum(idx_avgplot))

# Individual feature plots
# Each row contains 2 plots
# We fix the height of each plot to the same height as is used later for DEGs
height_per_row = max(2, 0.3 * length(levels(sc$seurat_clusters)))
nr_rows = ceiling(length(known_markers_vect)/2)
fig_height_knownMarkers_vect = max(5, height_per_row * nr_rows)

You provided 7 list(s) of known marker genes. In the following tabs, you find:

  • Dot plots for all gene lists containing at most 50 genes
  • Average feature plots for all gene lists containing at least 10 genes
  • Individual feature plots for all genes if there are no more than 100 genes in total

Dot plot(s)

A dot plot visualises how gene expression changes across different clusters. The size of a dot encodes the percentage of cells in a cluster that expresses the gene, while the color encodes the scaled average expression across all cells within the cluster. Per gene, we group cells based on cluster identity, calculate average expression per cluster, subtract the mean of average expression values and divide by the standard deviation. The resulting scores describe how high or low a gene is expressed in a cluster compared to all other clusters.

if ((known_markers_n > 0) & any(idx_dotplot)) {
  known_markers_dotplot = known_markers_list[idx_dotplot]
  p_list = list()
  for (i in seq(known_markers_dotplot)) {
    g = known_markers_dotplot[[i]]
    g = g[length(g):1]
    p_list[[i]] = suppressMessages(
      Seurat::DotPlot(sc, features=g) + 
        scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
        AddStyle(title=paste("Known marker genes:", names(known_markers_dotplot)[i]), ylab="Cluster") + 
        theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) +
        lims(size=c(0,100)) +
        guides(color = FALSE)
      )
  }
  p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_layout(guides="collect")
  p
} else if ((known_markers_n > 0) & !any(idx_dotplot)) {
  message("This tab is used for dot plots for up to 50 genes. All provided lists are longer than this, and hence dot plots are skipped.")
} else {
  message("No known marker genes were provided and hence dot plots are skipped.")
}

Average feature plot(s)

An average feature plot visualises the average gene expression of each gene list on a single-cell level, subtracted by the aggregated expression of control feature sets. The color of the plot encodes the calculated scores, whereat positive scores suggest that genes are expressed more highly than expected.

if ((known_markers_n > 0) & any(idx_avgplot)) {
  known_markers_avgplot = known_markers_list[idx_avgplot]
  sc = Seurat::AddModuleScore(sc, features=known_markers_avgplot, ctrl=10, name="known_markers")
  idx_replace_names = grep("^known_markers[0-9]+$", colnames(sc@meta.data), perl=TRUE)
  colnames(sc@meta.data)[idx_replace_names] = names(known_markers_avgplot)
  p_list = Seurat::FeaturePlot(sc, features=names(known_markers_avgplot), cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
  for (i in seq(known_markers_avgplot)) {
    p_list[[i]] = p_list[[i]] + AddStyle(title=paste("Known marker genes:", names(known_markers_avgplot)[i]))
  }
  p = patchwork::wrap_plots(p_list, ncol=1)
  print(p)
} else if ((known_markers_n > 0) & !any(idx_avgplot)) {
  message("This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.")
} else {
  message("No known marker genes were provided and hence average feature plots are skipped.")
}

× (Message)
This tab is used to plot an average for 10 or more genes. All provided lists are shorter than this, and hence average feature plots are skipped.

Individual feature plots

An individual feature plot colours single cells on the UMAP according to their normalised gene expression.

if ((known_markers_n > 0) & length(known_markers_vect) <= 100) {
  p_list = Seurat::FeaturePlot(sc, features=known_markers_vect, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
  for (i in seq(p_list)) p_list[[i]] = p_list[[i]] + AddStyle()
  p = patchwork::wrap_plots(p_list, ncol=2)
  print(p)
} else if (length(known_markers_vect) > 100) { 
  message("This tab is used to plot up to 100 known marker genes. Your provided list is longer than this, and hence individual feature plots are skipped.")
} else {
  message("No known marker genes were provided and hence individual feature plots are skipped.")
}

Marker genes

We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw “RNA” data and the method “MAST.” Resulting p-values are adjusted using the Bonferroni method. However, note that the p-values are likely inflated, since both clusters and marker genes were determined based on the same gene expression data, and there ought to be gene expression differences by design. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.

# Find DEGs for every cluster compared to all remaining cells, report positive (=markers) and negative ones
# min.pct = requires feature to be detected at this minimum percentage in either of the two groups of cells 
# logfc.threshold = requires a feature to be differentially expressed on average by some amount between the two groups
# only.pos = find only positive markers 

# Review recommends using "MAST"; Mathias uses "LR"
# ALWAYS USE: assay="RNA" or assay="SCT"
# DONT USE: assay=integrated datasets; this data is normalised and contains only 2k genes
# Note: By default, the function uses slot="data". Mast requires log data, so this is the correct way to do it.
#   https://www.bioconductor.org/packages/release/bioc/vignettes/MAST/inst/doc/MAST-interoperability.html
markers = suppressMessages(Seurat::FindAllMarkers(sc, assay="RNA", test.use="MAST",
                                               only.pos=FALSE, min.pct=param$marker_pct, logfc.threshold=param$marker_log2FC,
                                               latent.vars=param$latent_vars, verbose=FALSE, silent=TRUE))

# If no markers were found, initialise the degs table so that further downstream (export) chunks run
if (ncol(markers)==0) markers = DegsEmptyMarkerResultsTable(levels(sc$seurat_clusters))

# For Seurat versions until 3.2, log fold change is based on the natural log. Convert to log base 2.
if ("avg_logFC" %in% colnames(markers) & !"avg_log2FC" %in% colnames(markers)) {
  lfc_idx = grep("avg_log\\S*FC", colnames(markers))
  markers[,lfc_idx] = marker_deg_results[,lfc_idx] / log(2)
  col_nms = colnames(markers)
  col_nms[2] = "avg_log2FC"
  colnames(markers) = col_nms
}

# Sort markers
markers = markers %>% DegsSort(group=c("cluster"))
  
# Filter markers 
markers_filt = DegsFilter(markers, cut_log2FC=param$marker_log2FC, cut_padj=param$marker_padj)
markers_found = nrow(markers_filt$all)>0

# Add average data to table
markers_out = cbind(markers_filt$all, DegsAvgDataPerIdentity(sc, genes=markers_filt$all$gene))

# Split by cluster and write to file
additional_readme = data.frame(Column=c("cluster",
                                        "p_val_adj_score",
                                        "avg_<assay>_<slot>_id<cluster>"), 
                               Description=c("Cluster",
                                             "Score calculated as follows: -log10(p_val_adj)*sign(avg_log2FC)",
                                             "Average expression value for cluster; <assay>: RNA or SCT; <slot>: raw counts or normalised data"))

invisible(DegsWriteToFile(split(markers_out, markers_out$cluster),
                                       annot_ensembl=annot_ensembl,
                                       gene_to_ensembl=seurat_rowname_to_ensembl,
                                       additional_readme=additional_readme,
                                       file=file.path(param$path_out, "marker_degs", "markers_cluster_vs_rest.xlsx")))


# Plot number of differentially expressed genes
p = DegsPlotNumbers(markers_filt$all, 
                      group="cluster", 
                      title=paste0("Number of DEGs, comparing each cluster to the rest\n(FC=", 2^param$marker_log2FC, ", adj. p-value=", param$marker_padj, ")")) 

# Add marker table to seurat object
Seurat::Misc(sc, "markers") = list(condition_column="seurat_clusters", test="MAST", padj=param$marker_padj, 
                                   log2FC=param$marker_log2FC, min_pct=param$marker_pct, assay="RNA", slot="data",
                                   latent_vars=param$latent_vars,
                                   results=markers_filt$all)

# Add marker lists to seurat object
marker_genesets_up = split(markers_filt$up$gene, markers_filt$up$cluster)
names(marker_genesets_up) = paste0("markers_up_cluster", names(marker_genesets_up))
marker_genesets_down = split(markers_filt$down$gene, markers_filt$down$cluster)
names(marker_genesets_down) = paste0("markers_down_cluster", names(marker_genesets_down))
sc = ScAddLists(sc, lists=c(marker_genesets_up, marker_genesets_down), lists_slot="gene_lists")

if (markers_found) {
  p
} else {
  warning("No differentially expressed genes (cluster vs rest) found. The following related code is not executed, no related plots and tables are generated.")
}

Table of top marker genes

We use the term “marker genes” to specifically describe genes that are up-regulated in cells of one cluster compared to the rest.

if (markers_found) {
  markers_top = DegsUpDisplayTop(markers_filt$up, n=5)
  
  # Add labels
  markers_top$labels = paste0(markers_top$cluster, ": ", markers_top$gene)

  # Show table
  knitr::kable(markers_top, align="l", caption="Up to top 5 marker genes per cell cluster") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%", height="700px") 
}
Up to top 5 marker genes per cell cluster
cluster gene avg_log2FC p_val p_val_adj pct.1 pct.2 labels
1 GZMH 2.013 0.0e+00 0.0e+00 0.978 0.263 1: GZMH
1 NKG7 1.715 1.8e-300 3.8e-296 1.000 0.511 1: NKG7
1 FGFBP2 1.893 3.2e-298 6.8e-294 0.921 0.235 1: FGFBP2
1 CCL5.1 1.454 6.7e-266 1.4e-261 1.000 0.487 1: CCL5.1
1 CD8A 1.918 9.9e-249 2.1e-244 0.854 0.276 1: CD8A
2 IL7R 1.768 3.7e-238 7.8e-234 0.935 0.293 2: IL7R
2 LTB 1.831 5.2e-226 1.1e-221 0.925 0.338 2: LTB
2 LDHB 1.201 3.0e-156 6.4e-152 0.947 0.622 2: LDHB
2 ZFP36L2 1.300 7.6e-143 1.6e-138 0.985 0.834 2: ZFP36L2
2 LEPROTL1 1.113 8.7e-129 1.9e-124 0.922 0.533 2: LEPROTL1
3 GZMK 2.456 3.7e-252 7.9e-248 0.642 0.046 3: GZMK
3 DUSP2 1.741 9.0e-186 1.9e-181 0.884 0.402 3: DUSP2
3 IL7R 1.018 4.8e-105 1.0e-100 0.792 0.321 3: IL7R
3 PIK3R1 1.090 3.6e-99 7.7e-95 0.822 0.490 3: PIK3R1
3 TNFAIP3 1.179 9.1e-85 1.9e-80 0.866 0.558 3: TNFAIP3
4 GNLY 3.426 0.0e+00 0.0e+00 0.965 0.275 4: GNLY
4 CTSW 1.890 4.7e-255 1.0e-250 0.982 0.462 4: CTSW
4 GZMB 2.108 1.7e-249 3.7e-245 0.934 0.258 4: GZMB
4 PRF1 2.226 7.0e-244 1.5e-239 0.942 0.330 4: PRF1
4 KLRF1 1.876 2.2e-231 4.7e-227 0.589 0.032 4: KLRF1
5 CD79A 3.997 0.0e+00 0.0e+00 1.000 0.018 5: CD79A
5 HLA-DQA1 3.049 0.0e+00 0.0e+00 0.979 0.029 5: HLA-DQA1
5 MS4A1 2.844 0.0e+00 0.0e+00 0.878 0.016 5: MS4A1
5 LINC00926 2.627 0.0e+00 0.0e+00 0.871 0.019 5: LINC00926
5 BANK1 2.392 0.0e+00 0.0e+00 0.836 0.011 5: BANK1
6 S100A9 6.058 0.0e+00 0.0e+00 0.998 0.062 6: S100A9
6 VCAN 4.354 0.0e+00 0.0e+00 0.988 0.027 6: VCAN
6 FCN1 3.962 0.0e+00 0.0e+00 0.993 0.054 6: FCN1
6 CD14 3.459 0.0e+00 0.0e+00 0.951 0.009 6: CD14
6 CLEC7A 2.697 0.0e+00 0.0e+00 0.959 0.056 6: CLEC7A
7 LEF1 1.740 3.8e-153 8.1e-149 0.755 0.077 7: LEF1
7 CCR7 1.768 4.4e-142 9.3e-138 0.769 0.094 7: CCR7
7 TCF7 1.369 4.7e-94 1.0e-89 0.857 0.314 7: TCF7
7 IL7R 1.093 9.7e-89 2.1e-84 0.945 0.353 7: IL7R
7 TRABD2A 1.245 8.7e-87 1.9e-82 0.564 0.077 7: TRABD2A
8 LST1 3.941 7.6e-245 1.6e-240 1.000 0.198 8: LST1
8 IFITM3 3.400 1.2e-208 2.5e-204 0.992 0.268 8: IFITM3
8 FCGR3A 3.363 5.1e-200 1.1e-195 1.000 0.152 8: FCGR3A
8 AIF1 3.580 1.5e-186 3.2e-182 1.000 0.166 8: AIF1
8 CDKN1C.1 3.276 6.2e-185 1.3e-180 0.923 0.020 8: CDKN1C.1
9 RGS10 4.240 6.8e-282 1.5e-277 0.769 0.407 9: RGS10
9 MAX 4.331 5.2e-260 1.1e-255 0.862 0.292 9: MAX
9 TAGLN2 4.238 4.2e-255 9.0e-251 0.938 0.709 9: TAGLN2
9 OST4 3.386 6.2e-219 1.3e-214 0.862 0.792 9: OST4
9 TUBA4A 4.482 1.4e-208 2.9e-204 0.846 0.403 9: TUBA4A
10 HLA-DPB1.2 3.420 6.9e-83 1.5e-78 1.000 0.570 10: HLA-DPB1.2
10 HLA-DPA1.2 3.619 4.3e-81 9.2e-77 1.000 0.549 10: HLA-DPA1.2
10 FCER1A 2.951 1.0e-58 2.1e-54 0.756 0.009 10: FCER1A
10 CD74 3.195 7.9e-55 1.7e-50 1.000 0.775 10: CD74
10 PLD4 2.288 4.3e-51 9.2e-47 0.889 0.042 10: PLD4

Visualisation of top marker genes

The following plots visualise the top marker genes for each cluster, respectively. Clear marker genes indicate good clusters that represent cell types.

# Note: We need to run this chunk as it specifies a variable that is used in chunk definitions below
if (markers_found) {

  # Feature plots and violin plots: each row contains 5 plots
  #   Row height is not dependent on the number of clusters
  #   The plot has 5 columns and 1 row per cluster, hence the layout works nicely if we find 
  #     at least 5 markers per cluster
  nr_rows_5cols = ceiling(nrow(markers_top)/5)
  fig_height_5cols = nr_rows_5cols * 3
  
  # Dotplots: each row contains 2 plots
  #   Row height is dependent on the number of clusters 
  nr_rows_dp_2cols = ceiling(length(levels(sc$seurat_clusters))/2)
  fig_height_dp_2cols = nr_rows_dp_2cols * max(4, 0.5 * length(levels(sc$seurat_clusters)))
  
} else {
  fig_height_5cols = fig_height_dp_2cols = 7 
}

Feature plots

if (markers_found) {
  # Plot each marker one by one, and then combine them all at the end
  p_list = list()
  for (i in 1:nrow(markers_top)) { 
    p_list[[i]] = Seurat::FeaturePlot(sc, features=markers_top$gene[i], 
                                      cols=c("lightgrey", param$col_clusters[markers_top$cluster[i]]),  
                                      combine=TRUE, label=TRUE) + 
      AddStyle(title=markers_top$labels[i], 
               xlab="", ylab="", 
               legend_position="bottom")
  }
  
  # Combine all plots
  p = patchwork::wrap_plots(p_list, ncol=5) + 
    patchwork::plot_annotation(title="UMAP, cells coloured by normalised gene expression data, top marker genes per cluster")
  p
}

Violin plots (normalised)

if (markers_found) {
  # Plot violin plots per marker gene, and combine it all at the end
  # This layout works out nicely if there are 5 marker genes per cluster
  p_list = list()
  for(i in 1:nrow(markers_top)) { 
    p_list[[i]] = Seurat::VlnPlot(sc, features=markers_top$gene[i], assay="RNA", pt.size=0, cols=param$col_clusters) + 
      AddStyle(title=markers_top$labels[i], xlab="")
  }
  p = patchwork::wrap_plots(p_list, ncol=5) + 
    patchwork::plot_annotation(title="Violin plot of for normalised gene expression data, top marker genes per cluster") & theme(legend.position="none")
  p
}

Dot plot (scaled)

if (markers_found) {
  # Visualises how feature expression changes across different clusters
  # Plot dotplots per cluster, and combine it all at the end
  p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
    genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
    p = suppressMessages(Seurat::DotPlot(sc, features=genes) + 
                           scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") +
                           AddStyle(title=paste0("Top marker genes for cluster ", cl, " (scaled)"), ylab="Cluster", legend_position="bottom") + 
                           theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
                           guides(size=guide_legend(order=1)))
    return(p)
  })
  
  p = patchwork::wrap_plots(p_list, ncol=2) 
  p
}

Dot plot (non-scaled)

if (markers_found) {
  # Visualises how feature expression changes across different clusters
  # Plot dotplots per cluster, and combine it all at the end
  p_list = lapply(markers_top$cluster %>% sort() %>% unique(), function(cl) {
    genes = markers_top %>% dplyr::filter(cluster==cl) %>% dplyr::pull(gene)
    genes = genes[length(genes):1]
    p = suppressMessages(DotPlotUpdated(sc, features=genes, scale=FALSE, cols=c("lightgrey", param$col)) + 
                           AddStyle(title=paste0("Top marker genes for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") + 
                           theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5)) + 
                           guides(size=guide_legend(order=1)))
    return(p)
  })
      
  p = patchwork::wrap_plots(p_list, ncol=2)
  p
}

Expression per cluster per sample

If the dataset contains multiple samples, we can visualise the expression of a gene that is up-regulated in a cluster separately for each sample. For each cluster, we extract up-regulated genes, and visualise expression of these genes in all cells in that cluster, split by their sample of origin.

fig_height_degs_per_cl = max(5, 
                             max(2, 0.3 * (sc$orig.ident %>% unique() %>% length())) * length(levels(sc$seurat_clusters)) * 2)

Scaled dotplots

First, we plot scaled expression as explained above (see section Known marker genes). This plot allows us to judge whether the expression of a gene is increased in one sample as compared to the other samples.

if (markers_found) {
  p_list = list()
  markers_filt_up_top = DegsUpDisplayTop(degs=markers_filt$up, n=50)
  for (cl in levels(sc$seurat_clusters)) {  
    markers_filt_up_cl_top = markers_filt_up_top %>% 
      dplyr::filter(cluster==cl) %>% 
      dplyr::pull(gene)

    if (length(markers_filt_up_cl_top) > 0) {
      p_list[[cl]] = suppressMessages(Seurat::DotPlot(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident") +
        scale_colour_gradient2(low="steelblue", mid="lightgrey", high="darkgoldenrod1") + 
        AddStyle(title=paste0("Up to 50 markers (up-regulated genes) for cluster ", cl), ylab="Cluster", legend_position="bottom") + 
        theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) + 
        guides(size=guide_legend(order=1)))
    }
  }
  p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster") 
  p
}

Non-scaled dotplots

Second, we plot normalised expression with no further scaling. This plot helps to get an impression of the total expression of a gene.

n_genes_max_dotplot = 50
p_list = list()
for (cl in levels(sc$seurat_clusters)) {            
  markers_filt_up_cl_top = markers_filt$up %>% 
    dplyr::filter(cluster==cl) %>% 
    dplyr::top_n(n=n_genes_max_dotplot, wt=p_val_adj_score) %>% 
    dplyr::pull(gene)
  if (length(markers_filt_up_cl_top) > 0) {
    p_list[[cl]] = DotPlotUpdated(sc, features=markers_filt_up_cl_top, idents=cl, group.by="orig.ident", scale=FALSE, cols=c("lightgrey", param$col)) +
      AddStyle(title=paste0("Up to ", n_genes_max_dotplot, " markers (up-regulated genes) for cluster ", cl, " (not scaled)"), ylab="Cluster", legend_position="bottom") + 
      theme(axis.text.x=element_text(angle=90, hjust=1, vjust=.5)) + 
      guides(size=guide_legend(order=1))
  }
  p = patchwork::wrap_plots(p_list, ncol=1) + patchwork::plot_annotation("Dotplot per cluster (not scaled)") 
  p
}

Heatmaps

All up- and down-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
    
  # Heatmap of all differentially expressed genes
  p = Seurat::DoHeatmap(sc, features=markers_filt$all$gene, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_blank()) +
    ggtitle("Heatmap of scaled gene expression data, all genes differentially expressed between a cluster and the rest")
  p
}

Top 300 up-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
  # With fig.height = 20, 300 features can be shown; distribute among clusters
  features_per_group = 300/length(levels(markers_filt$up$cluster))
  features_subset = markers_filt$up %>% 
    dplyr::group_by(cluster) %>% 
    dplyr::top_n(n=features_per_group, wt=avg_log2FC) %>% 
    dplyr::arrange(cluster, -avg_log2FC) %>%
    dplyr::pull(gene) %>%
    unique()
  
  # Heatmap of top up-regulated genes
  p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_text(size=8)) +
    ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
  p
}

Top 300 down-regulated genes

if (markers_found) {
  # This will sample 500 cells; the number of cells per seurat_cluster will be proportional
  cells_subset = ScSampleCells(sc, n=500, group="seurat_clusters", group_proportional=TRUE, seed=1)
  # With fig.height = 20, 300 features can be shown; distribute among clusters
  features_per_group = 300/length(levels(markers_filt$down$cluster))
  features_subset = markers_filt$down %>% 
    dplyr::group_by(cluster) %>% 
    dplyr::top_n(n=features_per_group, wt=-avg_log2FC) %>% 
    dplyr::arrange(cluster, avg_log2FC) %>%
    dplyr::pull(gene) %>%
    unique()
  
  # Heatmap of top down-regulated genes
  p = Seurat::DoHeatmap(sc, features=features_subset, group.colors=param$col_clusters, label=FALSE, cells=cells_subset) + 
    NoLegend() + 
    theme(axis.text.y=element_text(size=8)) +
    ggtitle("Heatmap of scaled gene expression data, top genes up-regulated in a cluster compared to the rest")
  p
}

Functional enrichment analysis

To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.

We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Chen 2020). You can choose to test functional enrichment from a wide range of databases:

dbs_all = enrichR::listEnrichrDbs()
knitr::kable(dbs_all, align="l", caption="Enrichr databases") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
  kableExtra::scroll_box(width="100%", height="300px")
Enrichr databases
geneCoverage genesPerTerm libraryName link numTerms
13362 275 Genome_Browser_PWMs http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ 615
27884 1284 TRANSFAC_and_JASPAR_PWMs http://jaspar.genereg.net/html/DOWNLOAD/ 326
6002 77 Transcription_Factor_PPIs 290
47172 1370 ChEA_2013 http://amp.pharm.mssm.edu/lib/cheadownload.jsp 353
47107 509 Drug_Perturbations_from_GEO_2014 http://www.ncbi.nlm.nih.gov/geo/ 701
21493 3713 ENCODE_TF_ChIP-seq_2014 http://genome.ucsc.edu/ENCODE/downloads.html 498
1295 18 BioCarta_2013 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 249
3185 73 Reactome_2013 http://www.reactome.org/download/index.html 78
2854 34 WikiPathways_2013 http://www.wikipathways.org/index.php/Download_Pathways 199
15057 300 Disease_Signatures_from_GEO_up_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
4128 48 KEGG_2013 http://www.kegg.jp/kegg/download/ 200
34061 641 TF-LOF_Expression_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 269
7504 155 TargetScan_microRNA http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 222
16399 247 PPI_Hub_Proteins http://amp.pharm.mssm.edu/X2K 385
12753 57 GO_Molecular_Function_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 1136
23726 127 GeneSigDB https://pubmed.ncbi.nlm.nih.gov/22110038/ 2139
32740 85 Chromosome_Location http://software.broadinstitute.org/gsea/msigdb/index.jsp 386
13373 258 Human_Gene_Atlas http://biogps.org/downloads/ 84
19270 388 Mouse_Gene_Atlas http://biogps.org/downloads/ 96
13236 82 GO_Cellular_Component_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 641
14264 58 GO_Biological_Process_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 5192
3096 31 Human_Phenotype_Ontology http://www.human-phenotype-ontology.org/ 1779
22288 4368 Epigenomics_Roadmap_HM_ChIP-seq http://www.roadmapepigenomics.org/ 383
4533 37 KEA_2013 http://amp.pharm.mssm.edu/lib/keacommandline.jsp 474
10231 158 NURSA_Human_Endogenous_Complexome https://www.nursa.org/nursa/index.jsf 1796
2741 5 CORUM http://mips.helmholtz-muenchen.de/genre/proj/corum/ 1658
5655 342 SILAC_Phosphoproteomics http://amp.pharm.mssm.edu/lib/keacommandline.jsp 84
10406 715 MGI_Mammalian_Phenotype_Level_3 http://www.informatics.jax.org/ 71
10493 200 MGI_Mammalian_Phenotype_Level_4 http://www.informatics.jax.org/ 476
11251 100 Old_CMAP_up http://www.broadinstitute.org/cmap/ 6100
8695 100 Old_CMAP_down http://www.broadinstitute.org/cmap/ 6100
1759 25 OMIM_Disease http://www.omim.org/downloads 90
2178 89 OMIM_Expanded http://www.omim.org/downloads 187
851 15 VirusMINT http://mint.bio.uniroma2.it/download.html 85
10061 106 MSigDB_Computational http://www.broadinstitute.org/gsea/msigdb/collections.jsp 858
11250 166 MSigDB_Oncogenic_Signatures http://www.broadinstitute.org/gsea/msigdb/collections.jsp 189
15406 300 Disease_Signatures_from_GEO_down_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
17711 300 Virus_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 323
17576 300 Virus_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 323
15797 176 Cancer_Cell_Line_Encyclopedia https://portals.broadinstitute.org/ccle/home 967
12232 343 NCI-60_Cancer_Cell_Lines http://biogps.org/downloads/ 93
13572 301 Tissue_Protein_Expression_from_ProteomicsDB https://www.proteomicsdb.org/ 207
6454 301 Tissue_Protein_Expression_from_Human_Proteome_Map http://www.humanproteomemap.org/index.php 30
3723 47 HMDB_Metabolites http://www.hmdb.ca/downloads 3906
7588 35 Pfam_InterPro_Domains ftp://ftp.ebi.ac.uk/pub/databases/interpro/ 311
7682 78 GO_Biological_Process_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 941
7324 172 GO_Cellular_Component_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 205
8469 122 GO_Molecular_Function_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 402
13121 305 Allen_Brain_Atlas_up http://www.brain-map.org/ 2192
26382 1811 ENCODE_TF_ChIP-seq_2015 http://genome.ucsc.edu/ENCODE/downloads.html 816
29065 2123 ENCODE_Histone_Modifications_2015 http://genome.ucsc.edu/ENCODE/downloads.html 412
280 9 Phosphatase_Substrates_from_DEPOD http://www.koehn.embl.de/depod/ 59
13877 304 Allen_Brain_Atlas_down http://www.brain-map.org/ 2192
15852 912 ENCODE_Histone_Modifications_2013 http://genome.ucsc.edu/ENCODE/downloads.html 109
4320 129 Achilles_fitness_increase http://www.broadinstitute.org/achilles 216
4271 128 Achilles_fitness_decrease http://www.broadinstitute.org/achilles 216
10496 201 MGI_Mammalian_Phenotype_2013 http://www.informatics.jax.org/ 476
1678 21 BioCarta_2015 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 239
756 12 HumanCyc_2015 http://humancyc.org/ 125
3800 48 KEGG_2015 http://www.kegg.jp/kegg/download/ 179
2541 39 NCI-Nature_2015 http://pid.nci.nih.gov/ 209
1918 39 Panther_2015 http://www.pantherdb.org/ 104
5863 51 WikiPathways_2015 http://www.wikipathways.org/index.php/Download_Pathways 404
6768 47 Reactome_2015 http://www.reactome.org/download/index.html 1389
25651 807 ESCAPE http://www.maayanlab.net/ESCAPE/ 315
19129 1594 HomoloGene http://www.ncbi.nlm.nih.gov/homologene 12
23939 293 Disease_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 839
23561 307 Disease_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 839
23877 302 Drug_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 906
15886 9 Genes_Associated_with_NIH_Grants https://grants.nih.gov/grants/oer.htm 32876
24350 299 Drug_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 906
3102 25 KEA_2015 http://amp.pharm.mssm.edu/Enrichr 428
31132 298 Gene_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 2460
30832 302 Gene_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 2460
48230 1429 ChEA_2015 http://amp.pharm.mssm.edu/Enrichr 395
5613 36 dbGaP http://www.ncbi.nlm.nih.gov/gap 345
9559 73 LINCS_L1000_Chem_Pert_up https://clue.io/ 33132
9448 63 LINCS_L1000_Chem_Pert_down https://clue.io/ 33132
16725 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_down http://www.gtexportal.org/ 2918
19249 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_up http://www.gtexportal.org/ 2918
15090 282 Ligand_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 261
16129 292 Aging_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 286
15309 308 Aging_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 286
15103 318 Ligand_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 261
15022 290 MCF7_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 401
15676 310 MCF7_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 401
15854 279 Microbe_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 312
15015 321 Microbe_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 312
3788 159 LINCS_L1000_Ligand_Perturbations_down https://clue.io/ 96
3357 153 LINCS_L1000_Ligand_Perturbations_up https://clue.io/ 96
12668 300 L1000_Kinase_and_GPCR_Perturbations_down https://clue.io/ 3644
12638 300 L1000_Kinase_and_GPCR_Perturbations_up https://clue.io/ 3644
8973 64 Reactome_2016 http://www.reactome.org/download/index.html 1530
7010 87 KEGG_2016 http://www.kegg.jp/kegg/download/ 293
5966 51 WikiPathways_2016 http://www.wikipathways.org/index.php/Download_Pathways 437
15562 887 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X 104
17850 300 Kinase_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 285
17660 300 Kinase_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 285
1348 19 BioCarta_2016 http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 237
934 13 HumanCyc_2016 http://humancyc.org/ 152
2541 39 NCI-Nature_2016 http://pid.nci.nih.gov/ 209
2041 42 Panther_2016 http://www.pantherdb.org/pathway/ 112
5209 300 DrugMatrix https://ntp.niehs.nih.gov/drugmatrix/ 7876
49238 1550 ChEA_2016 http://amp.pharm.mssm.edu/Enrichr 645
2243 19 huMAP http://proteincomplexes.org/ 995
19586 545 Jensen_TISSUES http://tissues.jensenlab.org/ 1842
22440 505 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 1302
8184 24 MGI_Mammalian_Phenotype_2017 http://www.informatics.jax.org/ 5231
18329 161 Jensen_COMPARTMENTS http://compartments.jensenlab.org/ 2283
15755 28 Jensen_DISEASES http://diseases.jensenlab.org/ 1811
10271 22 BioPlex_2017 http://bioplex.hms.harvard.edu/ 3915
10427 38 GO_Cellular_Component_2017 http://www.geneontology.org/ 636
10601 25 GO_Molecular_Function_2017 http://www.geneontology.org/ 972
13822 21 GO_Biological_Process_2017 http://www.geneontology.org/ 3166
8002 143 GO_Cellular_Component_2017b http://www.geneontology.org/ 816
10089 45 GO_Molecular_Function_2017b http://www.geneontology.org/ 3271
13247 49 GO_Biological_Process_2017b http://www.geneontology.org/ 10125
21809 2316 ARCHS4_Tissues http://amp.pharm.mssm.edu/archs4 108
23601 2395 ARCHS4_Cell-lines http://amp.pharm.mssm.edu/archs4 125
20883 299 ARCHS4_IDG_Coexp http://amp.pharm.mssm.edu/archs4 352
19612 299 ARCHS4_Kinases_Coexp http://amp.pharm.mssm.edu/archs4 498
25983 299 ARCHS4_TFs_Coexp http://amp.pharm.mssm.edu/archs4 1724
19500 137 SysMyo_Muscle_Gene_Sets http://sys-myo.rhcloud.com/ 1135
14893 128 miRTarBase_2017 http://mirtarbase.mbc.nctu.edu.tw/ 3240
17598 1208 TargetScan_microRNA_2017 http://www.targetscan.org/ 683
5902 109 Enrichr_Libraries_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 121
12486 299 Enrichr_Submissions_TF-Gene_Coocurrence http://amp.pharm.mssm.edu/Enrichr 1722
1073 100 Data_Acquisition_Method_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 12
19513 117 DSigDB http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ 4026
14433 36 GO_Biological_Process_2018 http://www.geneontology.org/ 5103
8655 61 GO_Cellular_Component_2018 http://www.geneontology.org/ 446
11459 39 GO_Molecular_Function_2018 http://www.geneontology.org/ 1151
19741 270 TF_Perturbations_Followed_by_Expression http://www.ncbi.nlm.nih.gov/geo/ 1958
27360 802 Chromosome_Location_hg19 http://hgdownload.cse.ucsc.edu/downloads.html 36
13072 26 NIH_Funded_PIs_2017_Human_GeneRIF https://www.ncbi.nlm.nih.gov/pubmed/ 5687
13464 45 NIH_Funded_PIs_2017_Human_AutoRIF https://www.ncbi.nlm.nih.gov/pubmed/ 12558
13787 200 Rare_Diseases_AutoRIF_ARCHS4_Predictions https://amp.pharm.mssm.edu/geneshot/ 3725
13929 200 Rare_Diseases_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
16964 200 NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 12558
17258 200 NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 5684
10352 58 Rare_Diseases_GeneRIF_Gene_Lists https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
10471 76 Rare_Diseases_AutoRIF_Gene_Lists https://amp.pharm.mssm.edu/geneshot/ 3725
12419 491 SubCell_BarCode http://www.subcellbarcode.org/ 104
19378 37 GWAS_Catalog_2019 https://www.ebi.ac.uk/gwas 1737
6201 45 WikiPathways_2019_Human https://www.wikipathways.org/ 472
4558 54 WikiPathways_2019_Mouse https://www.wikipathways.org/ 176
3264 22 TRRUST_Transcription_Factors_2019 https://www.grnpedia.org/trrust/ 571
7802 92 KEGG_2019_Human https://www.kegg.jp/ 308
8551 98 KEGG_2019_Mouse https://www.kegg.jp/ 303
12444 23 InterPro_Domains_2019 https://www.ebi.ac.uk/interpro/ 1071
9000 20 Pfam_Domains_2019 https://pfam.xfam.org/ 608
7744 363 DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 https://depmap.org/ 558
6204 387 DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 https://depmap.org/ 325
13420 32 MGI_Mammalian_Phenotype_Level_4_2019 http://www.informatics.jax.org/ 5261
14148 122 UK_Biobank_GWAS_v1 https://www.ukbiobank.ac.uk/tag/gwas/ 857
9813 49 BioPlanet_2019 https://tripod.nih.gov/bioplanet/ 1510
1397 13 ClinVar_2019 https://www.ncbi.nlm.nih.gov/clinvar/ 182
9116 22 PheWeb_2019 http://pheweb.sph.umich.edu/ 1161
17464 63 DisGeNET https://www.disgenet.org 9828
394 73 HMS_LINCS_KinomeScan http://lincs.hms.harvard.edu/kinomescan/ 148
11851 586 CCLE_Proteomics_2020 https://portals.broadinstitute.org/ccle 378
8189 421 ProteomicsDB_2020 https://www.proteomicsdb.org/ 913
18704 100 lncHUB_lncRNA_Co-Expression https://amp.pharm.mssm.edu/lnchub/ 3729
5605 39 Virus-Host_PPI_P-HIPSTer_2020 http://phipster.org/ 6715
5718 31 Elsevier_Pathway_Collection http://www.transgene.ru/disease-pathways/ 1721
14156 40 Table_Mining_of_CRISPR_Studies 802
16979 295 COVID-19_Related_Gene_Sets https://amp.pharm.mssm.edu/covid19 205
4383 146 MSigDB_Hallmark_2020 https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp 50
54974 483 Enrichr_Users_Contributed_Lists_2020 https://maayanlab.cloud/Enrichr 1482
12118 448 TG_GATES_2020 https://toxico.nibiohn.go.jp/english/ 1190
12361 124 Allen_Brain_Atlas_10x_scRNA_2021 https://portal.brain-map.org/ 766
9763 139 Descartes_Cell_Types_and_Tissue_2021 https://descartes.brotmanbaty.org/bbi/human-gene-expression-during-development/ 172
8078 102 KEGG_2021_Human https://www.kegg.jp/ 320
7173 43 WikiPathway_2021_Human https://www.wikipathways.org/ 622
5833 100 HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression https://hubmapconsortium.github.io/ccf-asct-reporter/ 344
markers_enriched=list()
if (markers_found) {
  # Upregulated markers
  
  # Convert Seurat names of upregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_up = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$up %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  # Tests done by Enrichr
  marker_genesets_up_enriched = purrr::map(marker_genesets_up, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_up_enriched = purrr::map(list_names(marker_genesets_up_enriched), function(n) {
    return(purrr::map(marker_genesets_up_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("up", nrow(d))))
    }))
  })

  # Write to files
  invisible(purrr::map(names(marker_genesets_up_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_up_enriched[[n]],
                        file=file.path(param$path_out, "marker_degs", paste0("functions_marker_up_cluster_vs_rest.xlsx", n, ".xlsx")))
  }))
  
  
  # Downregulated markers
  
  # Convert Seurat names of downregulated marker per cluster to Entrez; use named lists for translation
  # Is that still neccessary?
  marker_genesets_down = sapply(levels(sc$seurat_clusters), function(x) {
    tmp = markers_filt$down %>% dplyr::filter(cluster==x) %>% dplyr::pull(gene)
    tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
    return(tmp[!is.na(tmp)])
  }, USE.NAMES=TRUE, simplify=TRUE)
  
  #  Tests done by Enrichr
  marker_genesets_down_enriched = purrr::map(marker_genesets_down, EnrichrTest, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  marker_genesets_down_enriched = purrr::map(list_names(marker_genesets_down_enriched), function(n) {
    return(purrr::map(marker_genesets_down_enriched[[n]], function(d){
      return(cbind(d, Cluster=rep(n, nrow(d)), Direction=rep("down", nrow(d))))
    }))
  })
  
  # Write to files
  invisible(purrr::map(names(marker_genesets_down_enriched), function(n) {
    EnrichrWriteResults(enrichr_results=marker_genesets_down_enriched[[n]],
                        file=file.path(param$path_out, "marker_degs", paste0("functions_marker_down_cluster_vs_rest.xlsx", n, ".xlsx")))
  }))
  
  # Combine, flatten into data.frame and add to sc misc slot
  marker_genesets_enriched = c(marker_genesets_up_enriched, marker_genesets_down_enriched)
  marker_genesets_enriched = unname(marker_genesets_enriched)
  marker_genesets_enriched = purrr::map(marker_genesets_enriched, FlattenEnrichr) %>% dplyr::bind_rows()
  marker_genesets_enriched$Cluster = factor(marker_genesets_enriched$Cluster, levels=levels(sc$seurat_clusters))
  marker_genesets_enriched$Direction = factor(marker_genesets_enriched$Direction, levels=c("up", "down"))
  
  misc_content = Misc(sc, "markers")
  misc_content[["enrichr"]] = marker_genesets_enriched
  suppressWarnings({Misc(sc, "markers") = misc_content})
}

The following table contains the top enriched terms per cluster.

# Top enriched terms (TODO: better plots, functions)
if (markers_found) {
  cat('#### {.tabset} \n \n')
  
  # Get top ten up and down over all databases per cluster
  marker_genesets_top_enriched = marker_genesets_enriched %>% dplyr::group_by(Cluster, Direction) %>%
    dplyr::top_n(n=10, wt=Combined.Score)
  
  # Print as tabs
  for(n in levels(marker_genesets_top_enriched$Cluster)){
    cat('##### ', n, ' \n')
    
    print(knitr::kable(marker_genesets_top_enriched %>% dplyr::ungroup() %>% dplyr::filter(Cluster==n) %>% dplyr::select(Database, Term, Direction, Adjusted.P.value, Odds.Ratio, Combined.Score), 
                     align="l", caption="Top ten enriched terms per geneset", format="html") %>% 
    kableExtra::kable_styling(bootstrap_options=c("striped", "hover")) %>% 
    kableExtra::scroll_box(width="100%"))

    cat(' \n \n')
  }
  cat(' \n \n')
}

1
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 MHC class I protein binding (GO:0042288) up 0.0010954 302.6515 3102.378
GO_Molecular_Function_2018 T cell receptor binding (GO:0042608) up 0.0172453 333.0333 1847.767
GO_Molecular_Function_2018 phospholipase activator activity (GO:0016004) up 0.0172453 333.0333 1847.767
GO_Molecular_Function_2018 lipase activator activity (GO:0060229) up 0.0175993 277.5139 1497.033
GO_Biological_Process_2018 T cell activation (GO:0042110) up 0.0000301 105.3069 1603.984
GO_Biological_Process_2018 granzyme-mediated apoptotic signaling pathway (GO:0008626) up 0.0003310 908.3182 10945.738
GO_Biological_Process_2018 lymphocyte mediated immunity (GO:0002449) up 0.0009086 403.5960 4339.904
GO_Biological_Process_2018 regulation of lymphocyte apoptotic process (GO:0070228) up 0.0358453 333.0333 1847.767
GO_Biological_Process_2018 positive regulation of activation of Janus kinase activity (GO:0010536) up 0.0358453 333.0333 1847.767
GO_Biological_Process_2018 regulation of natural killer cell chemotaxis (GO:2000501) up 0.0358453 277.5139 1497.033
GO_Biological_Process_2018 positive regulation of macrophage chemotaxis (GO:0010759) up 0.0358453 277.5139 1497.033
GO_Biological_Process_2018 positive regulation of lymphocyte apoptotic process (GO:0070230) up 0.0358453 277.5139 1497.033
GO_Biological_Process_2018 regulation of cell-cell adhesion mediated by integrin (GO:0033632) up 0.0358453 277.5139 1497.033
GO_Biological_Process_2018 regulation of activation of Janus kinase activity (GO:0010533) up 0.0358453 277.5139 1497.033
GO_Biological_Process_2018 positive regulation of macrophage migration (GO:1905523) up 0.0358453 277.5139 1497.033
GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) up 0.0000000 739.8148 16543.208
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000903 184.2369 2488.768
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000903 149.6700 1940.172
GO_Molecular_Function_2018 MHC class II receptor activity (GO:0032395) down 0.0021679 191.9615 1800.445
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) down 0.0000135 479.2080 7573.227
GO_Biological_Process_2018 B cell mediated immunity (GO:0019724) down 0.0028780 255.9744 2521.841
GO_Biological_Process_2018 regulation of leukocyte activation (GO:0002694) down 0.0033273 219.3956 2106.522
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000002 332.7000 6524.138
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000003 237.5952 4394.580
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000012 132.9800 2188.505
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) down 0.0000013 114.6149 1824.354
2
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 mRNA 3’-UTR AU-rich region binding (GO:0035925) up 0.0282789 399.68000 2284.211
GO_Biological_Process_2018 definitive hemopoiesis (GO:0060216) up 0.0162858 799.56000 5053.709
GO_Biological_Process_2018 cellular response to granulocyte macrophage colony-stimulating factor stimulus (GO:0097011) up 0.0162858 799.56000 5053.709
GO_Biological_Process_2018 response to granulocyte macrophage colony-stimulating factor (GO:0097012) up 0.0162858 799.56000 5053.709
GO_Biological_Process_2018 negative regulation of cell cycle phase transition (GO:1901988) up 0.0162858 799.56000 5053.709
GO_Biological_Process_2018 late endosome to vacuole transport (GO:0045324) up 0.0162858 571.05714 3445.290
GO_Biological_Process_2018 late endosome to vacuole transport via multivesicular body sorting pathway (GO:0032511) up 0.0162858 571.05714 3445.290
GO_Biological_Process_2018 negative regulation of stem cell differentiation (GO:2000737) up 0.0162858 444.11111 2580.412
GO_Biological_Process_2018 3’-UTR-mediated mRNA destabilization (GO:0061158) up 0.0162858 444.11111 2580.412
GO_Biological_Process_2018 negative regulation of protein localization to cell surface (GO:2000009) up 0.0162858 399.68000 2284.211
GO_Biological_Process_2018 T cell differentiation in thymus (GO:0033077) up 0.0162858 399.68000 2284.211
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000032 156.44706 2657.140
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0001341 131.53187 1653.027
GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) down 0.0000000 59.54457 1508.806
GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen via MHC class II (GO:0019886) down 0.0000000 59.54457 1508.806
GO_Biological_Process_2018 granzyme-mediated apoptotic signaling pathway (GO:0008626) down 0.0018497 277.19444 2731.812
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) down 0.0027555 184.77778 1706.138
GO_Biological_Process_2018 B cell mediated immunity (GO:0019724) down 0.0027555 184.77778 1706.138
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000000 335.90909 8066.510
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000000 232.50583 5246.622
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000000 125.87121 2511.333
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) down 0.0000001 107.86797 2078.043
3
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 1-phosphatidylinositol-3-kinase regulator activity (GO:0046935) up 0.0202173 499.6500 2955.690
GO_Molecular_Function_2018 neurotrophin TRKA receptor binding (GO:0005168) up 0.0202173 499.6500 2955.690
GO_Molecular_Function_2018 urea transmembrane transporter activity (GO:0015204) up 0.0202173 499.6500 2955.690
GO_Molecular_Function_2018 MAP kinase tyrosine/serine/threonine phosphatase activity (GO:0017017) up 0.0202173 416.3542 2398.853
GO_Molecular_Function_2018 neurotrophin TRK receptor binding (GO:0005167) up 0.0202173 356.8571 2008.476
GO_Molecular_Function_2018 amide transmembrane transporter activity (GO:0042887) up 0.0202173 356.8571 2008.476
GO_Molecular_Function_2018 phosphatidylinositol 3-kinase regulator activity (GO:0035014) up 0.0202173 356.8571 2008.476
GO_Molecular_Function_2018 cAMP response element binding protein binding (GO:0008140) up 0.0202173 356.8571 2008.476
GO_Biological_Process_2018 negative regulation of bone resorption (GO:0045779) up 0.0415146 499.6500 2955.690
GO_Biological_Process_2018 protein deubiquitination involved in ubiquitin-dependent protein catabolic process (GO:0071947) up 0.0415146 499.6500 2955.690
GO_Biological_Process_2018 regulation of toll-like receptor 3 signaling pathway (GO:0034139) up 0.0415146 416.3542 2398.853
GO_Biological_Process_2018 erythrocyte development (GO:0048821) up 0.0415146 416.3542 2398.853
GO_Biological_Process_2018 response to vitamin D (GO:0033280) up 0.0415146 356.8571 2008.476
GO_Biological_Process_2018 negative regulation of bone remodeling (GO:0046851) up 0.0415146 356.8571 2008.476
GO_Biological_Process_2018 regulation of toll-like receptor 2 signaling pathway (GO:0034135) up 0.0415146 356.8571 2008.476
GO_Biological_Process_2018 negative regulation of toll-like receptor 4 signaling pathway (GO:0034144) up 0.0415146 356.8571 2008.476
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000199 242.4899 3458.013
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000199 196.9934 2701.616
GO_Biological_Process_2018 positive regulation of viral entry into host cell (GO:0046598) down 0.0011145 332.8667 3442.909
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) down 0.0011145 332.8667 3442.909
GO_Biological_Process_2018 B cell mediated immunity (GO:0019724) down 0.0011145 332.8667 3442.909
GO_Biological_Process_2018 regulation of leukocyte activation (GO:0002694) down 0.0013018 285.3000 2879.407
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000000 652.5817 17590.767
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000000 451.6968 11520.332
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000000 244.5343 5595.746
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) down 0.0000000 209.5588 4650.882
4
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) up 0.0002287 116.88867 1434.8216
GO_Molecular_Function_2018 arylesterase activity (GO:0004064) up 0.0497659 117.41176 535.4728
GO_Molecular_Function_2018 phospholipase activator activity (GO:0016004) up 0.0497659 117.41176 535.4728
GO_Molecular_Function_2018 MHC class I receptor activity (GO:0032393) up 0.0497659 117.41176 535.4728
GO_Biological_Process_2018 regulation of immune response (GO:0050776) up 0.0000001 28.21154 613.8195
GO_Biological_Process_2018 cellular defense response (GO:0006968) up 0.0004085 49.41191 633.4798
GO_Biological_Process_2018 granzyme-mediated apoptotic signaling pathway (GO:0008626) up 0.0036361 302.43939 3030.9316
GO_Biological_Process_2018 regulation of natural killer cell chemotaxis (GO:2000501) up 0.0037528 241.93939 2343.4834
GO_Biological_Process_2018 lymphocyte mediated immunity (GO:0002449) up 0.0054365 134.38384 1172.8783
GO_Biological_Process_2018 negative regulation by host of viral transcription (GO:0043922) up 0.0058650 120.93939 1033.6205
GO_Biological_Process_2018 positive regulation of lymphocyte migration (GO:2000403) up 0.0100160 80.60606 631.0721
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) down 0.0000150 307.21538 4575.5504
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000150 249.57500 3580.6467
GO_Molecular_Function_2018 MHC class II receptor activity (GO:0032395) down 0.0005827 312.09375 3208.6346
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000000 570.62857 12272.2207
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000000 407.51020 8309.6625
GO_Cellular_Component_2018 clathrin-coated vesicle membrane (GO:0030665) down 0.0000001 100.73887 1879.3951
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000001 228.08000 4184.8613
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) down 0.0000002 196.58128 3500.4158
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle (GO:0045334) down 0.0000009 123.82609 1992.1383
GO_Cellular_Component_2018 ER to Golgi transport vesicle membrane (GO:0012507) down 0.0000011 113.89714 1796.5315
5
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) up 0.0000000 271.90909 7420.5948
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) up 0.0000000 209.12937 5453.2743
GO_Molecular_Function_2018 MHC class II receptor activity (GO:0032395) up 0.0000003 289.04348 5420.3081
GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) up 0.0000000 49.54490 1330.8282
GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen via MHC class II (GO:0019886) up 0.0000000 49.54490 1330.8282
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) up 0.0000000 875.63415 41237.4881
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) up 0.0000000 486.36585 21357.1962
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) up 0.0000000 147.45877 4152.4107
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) up 0.0000000 124.74776 3388.2005
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle (GO:0045334) up 0.0000000 75.36452 1811.7972
GO_Cellular_Component_2018 ER to Golgi transport vesicle membrane (GO:0012507) up 0.0000000 68.93667 1618.3072
GO_Molecular_Function_2018 T cell receptor binding (GO:0042608) down 0.0007908 179.12613 2247.1521
GO_Biological_Process_2018 T cell activation (GO:0042110) down 0.0000000 30.66563 920.1930
GO_Biological_Process_2018 leukocyte cell-cell adhesion (GO:0007159) down 0.0000643 41.41785 609.9258
GO_Biological_Process_2018 granzyme-mediated apoptotic signaling pathway (GO:0008626) down 0.0003254 179.12613 2247.1521
GO_Biological_Process_2018 immunological synapse formation (GO:0001771) down 0.0003254 179.12613 2247.1521
GO_Biological_Process_2018 regulation of protein localization to early endosome (GO:1902965) down 0.0121739 88.75893 679.0298
GO_Biological_Process_2018 gland morphogenesis (GO:0022612) down 0.0121739 88.75893 679.0298
GO_Biological_Process_2018 positive regulation of protein localization to endosome (GO:1905668) down 0.0121739 88.75893 679.0298
GO_Biological_Process_2018 positive regulation of protein localization to early endosome (GO:1902966) down 0.0121739 88.75893 679.0298
GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) down 0.0000000 187.52830 6031.6093
6
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 Toll-like receptor binding (GO:0035325) up 0.0013272 59.32331 739.1403
GO_Biological_Process_2018 neutrophil activation involved in immune response (GO:0002283) up 0.0000000 18.21456 2414.9447
GO_Biological_Process_2018 neutrophil degranulation (GO:0043312) up 0.0000000 18.01524 2347.6523
GO_Biological_Process_2018 neutrophil mediated immunity (GO:0002446) up 0.0000000 17.65897 2279.1587
GO_Biological_Process_2018 regulation of leukocyte degranulation (GO:0043300) up 0.0002384 59.32331 739.1403
GO_Biological_Process_2018 negative regulation of erythrocyte differentiation (GO:0045647) up 0.0013701 73.88390 735.9340
GO_Biological_Process_2018 cellular response to bacterial lipopeptide (GO:0071221) up 0.0013701 73.88390 735.9340
GO_Cellular_Component_2018 tertiary granule (GO:0070820) up 0.0000000 17.46596 942.6245
GO_Cellular_Component_2018 ficolin-1-rich granule (GO:0101002) up 0.0000000 15.88961 847.2847
GO_Cellular_Component_2018 secondary lysosome (GO:0005767) up 0.0000045 46.51415 674.0303
GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) down 0.0000000 198.33000 29611.4448
GO_Biological_Process_2018 cotranslational protein targeting to membrane (GO:0006613) down 0.0000000 183.60185 27007.1335
GO_Biological_Process_2018 protein targeting to ER (GO:0045047) down 0.0000000 170.90517 24783.1573
GO_Biological_Process_2018 viral gene expression (GO:0019080) down 0.0000000 139.52113 19384.6305
GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) down 0.0000000 135.68493 18735.8540
GO_Biological_Process_2018 viral transcription (GO:0019083) down 0.0000000 133.84459 18425.5796
GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) down 0.0000000 122.44280 16911.3963
GO_Cellular_Component_2018 cytosolic part (GO:0044445) down 0.0000000 86.27742 10921.3226
GO_Cellular_Component_2018 cytosolic small ribosomal subunit (GO:0022627) down 0.0000000 141.15891 10403.8701
GO_Cellular_Component_2018 small ribosomal subunit (GO:0015935) down 0.0000000 124.02374 8899.1646
7
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Biological_Process_2018 response to interleukin-4 (GO:0070670) up 0.0022988 439.07692 4776.8888
GO_Biological_Process_2018 cellular response to interleukin-4 (GO:0071353) up 0.0022988 439.07692 4776.8888
GO_Biological_Process_2018 negative regulation of phospholipid metabolic process (GO:1903726) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 positive regulation of dendritic cell antigen processing and presentation (GO:0002606) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 immunological synapse formation (GO:0001771) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 positive regulation of granulocyte differentiation (GO:0030854) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 positive regulation of humoral immune response (GO:0002922) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 alpha-beta T cell activation (GO:0046631) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 positive regulation of antigen processing and presentation (GO:0002579) up 0.0377162 285.42857 1542.8676
GO_Biological_Process_2018 regulation of dendritic cell chemotaxis (GO:2000508) up 0.0377162 285.42857 1542.8676
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) down 0.0000851 67.17975 926.8522
GO_Molecular_Function_2018 phospholipase inhibitor activity (GO:0004859) down 0.0002116 149.34000 1863.7702
GO_Biological_Process_2018 platelet degranulation (GO:0002576) down 0.0000000 26.77532 705.0272
GO_Biological_Process_2018 granzyme-mediated apoptotic signaling pathway (GO:0008626) down 0.0001566 248.92500 3361.4016
GO_Biological_Process_2018 positive regulation of vesicle fusion (GO:0031340) down 0.0069214 122.91975 1018.2868
GO_Biological_Process_2018 membrane raft assembly (GO:0001765) down 0.0069214 122.91975 1018.2868
GO_Biological_Process_2018 granulocyte migration (GO:0097530) down 0.0088907 98.33086 781.7682
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) down 0.0000001 141.79487 2832.0750
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) down 0.0000003 98.14596 1818.8552
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) down 0.0000032 53.13301 846.8796
8
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Biological_Process_2018 neutrophil degranulation (GO:0043312) up 0.0000000 15.12225 1228.8822
GO_Biological_Process_2018 neutrophil activation involved in immune response (GO:0002283) up 0.0000000 14.97978 1211.3135
GO_Biological_Process_2018 neutrophil mediated immunity (GO:0002446) up 0.0000000 14.83992 1194.1236
GO_Biological_Process_2018 regulation of mast cell degranulation (GO:0043304) up 0.0000333 52.09211 795.7486
GO_Biological_Process_2018 cellular response to thyroid hormone stimulus (GO:0097067) up 0.0001482 69.10646 915.7695
GO_Biological_Process_2018 response to thyroid hormone (GO:0097066) up 0.0001827 59.23111 758.5874
GO_Biological_Process_2018 defense response to protozoan (GO:0042832) up 0.0016821 61.87500 613.6493
GO_Cellular_Component_2018 ficolin-1-rich granule (GO:0101002) up 0.0000000 16.31561 689.7468
GO_Cellular_Component_2018 ficolin-1-rich granule lumen (GO:1904813) up 0.0000000 19.07990 689.1757
GO_Cellular_Component_2018 tertiary granule lumen (GO:1904724) up 0.0000000 23.73574 541.6109
GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) down 0.0000000 85.69717 6200.1546
GO_Biological_Process_2018 cotranslational protein targeting to membrane (GO:0006613) down 0.0000000 80.85285 5761.8867
GO_Biological_Process_2018 protein targeting to ER (GO:0045047) down 0.0000000 76.52526 5374.4356
GO_Biological_Process_2018 viral gene expression (GO:0019080) down 0.0000000 65.17763 4378.8516
GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) down 0.0000000 63.72281 4253.5646
GO_Biological_Process_2018 viral transcription (GO:0019083) down 0.0000000 63.01938 4193.1941
GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) down 0.0000000 56.19195 3614.7041
GO_Cellular_Component_2018 cytosolic small ribosomal subunit (GO:0022627) down 0.0000000 84.39804 3698.1952
GO_Cellular_Component_2018 small ribosomal subunit (GO:0015935) down 0.0000000 75.94294 3240.4742
GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) down 0.0000000 221.04444 7385.0970
9
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Biological_Process_2018 platelet degranulation (GO:0002576) up 0.0000000 17.36236 780.2126
GO_Biological_Process_2018 regulated exocytosis (GO:0045055) up 0.0000000 13.98476 567.7365
GO_Biological_Process_2018 platelet aggregation (GO:0070527) up 0.0000000 39.74558 1248.4514
GO_Biological_Process_2018 homotypic cell-cell adhesion (GO:0034109) up 0.0000000 32.09405 944.9956
GO_Biological_Process_2018 polyamine metabolic process (GO:0006595) up 0.0001128 48.51724 704.7961
GO_Biological_Process_2018 polyamine biosynthetic process (GO:0006596) up 0.0043815 67.47260 654.2758
GO_Biological_Process_2018 positive regulation of fibroblast migration (GO:0010763) up 0.0043815 67.47260 654.2758
GO_Biological_Process_2018 regulation of blood vessel remodeling (GO:0060312) up 0.0043815 67.47260 654.2758
GO_Cellular_Component_2018 platelet alpha granule (GO:0031091) up 0.0000000 17.71931 612.9590
GO_Cellular_Component_2018 autolysosome (GO:0044754) up 0.0000399 67.70103 858.7579
GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) down 0.0000000 292.66937 74659.7352
GO_Biological_Process_2018 cotranslational protein targeting to membrane (GO:0006613) down 0.0000000 233.36216 59018.9233
GO_Biological_Process_2018 protein targeting to ER (GO:0045047) down 0.0000000 194.99812 48980.2171
GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) down 0.0000000 103.51279 23785.9600
GO_Biological_Process_2018 viral gene expression (GO:0019080) down 0.0000000 91.76048 19624.0570
GO_Biological_Process_2018 viral transcription (GO:0019083) down 0.0000000 84.30741 17748.9609
GO_Biological_Process_2018 cytoplasmic translation (GO:0002181) down 0.0000000 120.67807 14662.6459
GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) down 0.0000000 78.11807 17253.7494
GO_Cellular_Component_2018 large ribosomal subunit (GO:0015934) down 0.0000000 82.78017 11266.2469
GO_Cellular_Component_2018 cytosolic large ribosomal subunit (GO:0022625) down 0.0000000 88.64557 11970.1895
10
Top ten enriched terms per geneset
Database Term Direction Adjusted.P.value Odds.Ratio Combined.Score
GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) up 0.0000054 79.93162 1392.3565
GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) up 0.0000071 62.79393 1033.2344
GO_Molecular_Function_2018 MHC class II receptor activity (GO:0032395) up 0.0000155 116.23392 1774.0424
GO_Biological_Process_2018 positive regulation of interleukin-2 biosynthetic process (GO:0045086) up 0.0007927 103.70609 1184.3418
GO_Biological_Process_2018 positive regulation of viral entry into host cell (GO:0046598) up 0.0007927 103.70609 1184.3418
GO_Biological_Process_2018 immunoglobulin mediated immune response (GO:0016064) up 0.0007927 103.70609 1184.3418
GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) up 0.0000000 460.13889 20627.8882
GO_Cellular_Component_2018 MHC protein complex (GO:0042611) up 0.0000000 230.02315 9447.4110
GO_Cellular_Component_2018 integral component of lumenal side of endoplasmic reticulum membrane (GO:0071556) up 0.0000000 68.78268 1797.5995
GO_Cellular_Component_2018 clathrin-coated endocytic vesicle membrane (GO:0030669) up 0.0000000 57.76582 1442.9954
GO_Molecular_Function_2018 T cell receptor binding (GO:0042608) down 0.0000002 664.46667 13535.5705
GO_Molecular_Function_2018 MHC class I protein binding (GO:0042288) down 0.0008597 89.08346 1016.3754
GO_Molecular_Function_2018 RAGE receptor binding (GO:0050786) down 0.0110879 91.83871 728.8221
GO_Molecular_Function_2018 Toll-like receptor binding (GO:0035325) down 0.0110879 91.83871 728.8221
GO_Biological_Process_2018 T cell activation (GO:0042110) down 0.0000000 41.13072 1045.2915
GO_Biological_Process_2018 T cell differentiation (GO:0030217) down 0.0000236 51.11197 809.4938
GO_Biological_Process_2018 immunological synapse formation (GO:0001771) down 0.0059399 160.74194 1415.3597
GO_Biological_Process_2018 nucleus localization (GO:0051647) down 0.0059399 160.74194 1415.3597
GO_Biological_Process_2018 leukocyte aggregation (GO:0070486) down 0.0074689 128.58710 1089.2304
GO_Biological_Process_2018 cytoskeletal anchoring at nuclear membrane (GO:0090286) down 0.0090345 107.15054 877.0424
GO_Biological_Process_2018 negative regulation of T cell apoptotic process (GO:0070233) down 0.0098085 91.83871 728.8221
GO_Biological_Process_2018 nuclear migration (GO:0007097) down 0.0098085 91.83871 728.8221
GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) down 0.0000000 355.85714 13155.6972

Differentially expressed genes

If requested, we identify genes that are differentially expressed between two groups of cells. Groups can be defined by columns in the cell metadata. Different types of tests can be used and input data for testing can be the different assays as well as the computed dimensionality reductions. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.

# We first compute the DEGs for all requested contrasts

# Prepare a list with contrasts (input can be R data.frame table or Excel file)
degs_contrasts_list = DegsSetupContrastsList(sc, param$deg_contrasts, param$latent_vars)

# Add the actual data to the list
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){
  # If there were already errors, just return
  if (length(contrast[["error_messages"]]) > 0) return(c(contrast, list(object=NULL, cells_group1_idx_subset=as.integer(), cells_group2_idx_subset=as.integer())))
  
  # Get cells indices
  cells_group1_idx = contrast[["cells_group1_idx"]]
  cells_group2_idx = contrast[["cells_group2_idx"]]

  # Create object
  if (contrast[["use_reduction"]]) {
    # Use dimensionality reduction
    contrast[["object"]] = Seurat::Reductions(sc, slot=contrast[["assay"]])
  } else {
    # Use assay
    contrast[["object"]] = Seurat::GetAssay(sc[,unique(c(cells_group1_idx, cells_group2_idx))], assay=contrast[["assay"]])
    
    # This saves a lot of memory for parallelisation
    if (contrast[["slot"]]!="scale.data") contrast[["object"]]@scale.data = new(Class='matrix')
  }
  
  # Variable latent vars must be passed as data.frame
  if (!is.null(contrast[["latent_vars"]]) && length(contrast[["latent_vars"]]) > 0) {
    contrast[["latent_vars"]] = sc[[unique(c(cells_group1_idx, cells_group2_idx)), contrast[["latent_vars"]], drop=FALSE]]
  }

  # Now update cell indices so that they match to subset
  contrast[["cells_group1_idx_subset"]] = match(colnames(sc)[cells_group1_idx], colnames(contrast[["object"]]))
  contrast[["cells_group2_idx_subset"]] = match(colnames(sc)[cells_group2_idx], colnames(contrast[["object"]]))
  
  return(contrast)
})

# Compute the tests
# TODO: this chunk may be done in parallel in future
degs_contrasts_results = purrr::map(degs_contrasts_list, function(contrast) {
  if (length(contrast$error_messages)==0) {
    # No errors, do contrast
    test_results = suppressWarnings(
      DegsTestCellSets(object=contrast[["object"]],
                       slot=contrast[["slot"]],
                       cells_1=colnames(contrast[["object"]])[contrast[["cells_group1_idx_subset"]]],
                       cells_2=colnames(contrast[["object"]])[contrast[["cells_group2_idx_subset"]]],
                       is_reduction=contrast[["use_reduction"]],
                       logfc.threshold=contrast[["log2FC"]],
                       test.use=contrast[["test"]],
                       min.pct=contrast[["min_pct"]],
                       latent.vars=contrast[["latent_vars"]])
    )
  } else {
    # Errors, return empty data.frame
    test_results = DegsEmptyResultsTable()
  }
  
  # Sort and filter table
  test_results = test_results %>% DegsSort() %>% DegsFilter(contrast[["log2FC"]], contrast[["padj"]], split_by_dir=FALSE)

  # Add mean gene expression data (counts or data, dep on slot)
  avg.1 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group1_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
  avg.2 = DegsAvgData(contrast[["object"]], cells=contrast[["cells_group2_idx_subset"]], genes=test_results$gene, slot=contrast[["slot"]])[,1]
  test_results = cbind(test_results, avg.1, avg.2)
  
  # Add test results and drop unneccessary data
  contrast = c(contrast, list(results=test_results))
  contrast[["object"]] = NULL
  contrast[["cells_group1_idx_subset"]] = NULL
  contrast[["cells_group2_idx_subset"]] = NULL
  
  return(contrast)
})

# Also remove objects from deg_contrasts_list (save memory)
degs_contrasts_list = purrr::map(degs_contrasts_list, function(contrast){ contrast[["object"]] = NULL; return(contrast)})
# Use the existing variable and add Enrichr results
# Not in parallel due to server load
degs_contrasts_results = purrr::map(degs_contrasts_results, function(contrast) {
  # Get results table
  results_table = contrast$results
  
  # Drop existing results
  if ("enrichr" %in% names(contrast)) contrast[["enrichr"]] = NULL
  
  # Split into up- and downregulated DEGs, then translate to Entrez gene, runEnrichr
  degs_up = dplyr::filter(results_table, avg_log2FC > 0) %>% dplyr::pull(gene) %>% unique()
  degs_up = sapply(degs_up, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
  degs_up = degs_up[!is.na(degs_up)]
  enrichr_results_up = EnrichrTest(genes=degs_up, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  
  degs_down = dplyr::filter(results_table, avg_log2FC < 0) %>% dplyr::pull(gene) %>% unique()
  degs_down = sapply(degs_down, function(n) seurat_rowname_to_entrez[[n]][1], USE.NAMES=TRUE, simplify=TRUE) %>% unlist() %>% as.character()
  degs_down = degs_down[!is.na(degs_down)]
  enrichr_results_down = EnrichrTest(genes=degs_down, databases=param$enrichr_dbs, padj=param$enrichr_padj)
  
  # Flatten both enrichr results into tables
  enrichr_results_up = purrr::map_dfr(names(enrichr_results_up), function(n) {
    return(cbind(enrichr_results_up[[n]], 
          list(Database=factor(rep(n, nrow(enrichr_results_up[[n]])), levels=names(enrichr_results_up)), 
               Direction=factor(rep("up", nrow(enrichr_results_up[[n]])), levels=c("up", "down"))
               )
          ))
  })
  
  enrichr_results_down = purrr::map_dfr(names(enrichr_results_down), function(n) {
    return(cbind(enrichr_results_down[[n]], 
          list(Database=factor(rep(n, nrow(enrichr_results_down[[n]])), levels=names(enrichr_results_down)), 
               Direction=factor(rep("up", nrow(enrichr_results_down[[n]])), levels=c("up", "down"))
               )
          ))
  })
  
  # Rbind and add factor levels
  enrichr_results = rbind(enrichr_results_up, enrichr_results_down)
  return(c(contrast, list(enrichr=enrichr_results)))
})
# Now regroup list so that subsets are together again
original_contrast_rows = purrr::map_int(degs_contrasts_results, function(contrast){ return(contrast[["contrast_row"]]) })
degs = split(degs_contrasts_results, original_contrast_rows)

# Write degs to files
invisible(purrr::map_chr(degs, function(degs_subsets) {
  # The output file
  file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]], "_results.xlsx"))
  
  # Write degs
  degs_subsets_results = purrr::map(degs_subsets, function(contrast) {return(contrast[["results"]])})
  names(degs_subsets_results) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
  file = DegsWriteToFile(degs_subsets_results, 
                         annot_ensembl=annot_ensembl,
                         gene_to_ensembl=seurat_rowname_to_ensembl,
                         file=file,
                         additional_readme=NULL)
  
  return(file)
}))


invisible(purrr::map_chr(degs, function(degs_subsets) {
  # The output file
  file = file.path(param$path_out, "marker_degs", paste0("degs_contrast_row_", degs_subsets[[1]][["contrast_row"]],  "_functions.xlsx"))
  
  # Write Enrichr results
  degs_subsets_enrichr = purrr::map(degs_subsets, function(contrast) {return(contrast[["enrichr"]])})
  names(degs_subsets_enrichr) = purrr::map_chr(degs_subsets, function(contrast) {return(ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All"))})
  file = EnrichrWriteResults(degs_subsets_enrichr, file=file)
  
  return(file)
}))

# Add to sc object
Misc(sc, "degs") = degs
for (i in seq(degs)) { 
  for (j in seq(degs[[i]])) {

    # Average expression of all genes
    x = subset(sc, cells=degs[[i]][[j]]$cells_group2_idx) %>% GetAssayData(slot="data")
    x = log2(Matrix::rowMeans(expm1(x)) + 1)
    y = subset(sc, cells=degs[[i]][[j]]$cells_group1_idx) %>% GetAssayData(slot="data") 
    y = log2(Matrix::rowMeans(expm1(y)) + 1)
    sc_avg_log2FC = data.frame(x, y, col="none", gene=rownames(sc))
    lims = c(min(c(x, y)), max(x, y))
    
    ## Color DEGs
    up = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC > 0) %>% dplyr::pull(gene)
    if (length(up) > 0) sc_avg_log2FC[up, "col"] = "up"
    down = degs[[i]][[j]]$results %>% dplyr::filter(avg_log2FC < 0) %>% dplyr::pull(gene)
    if (length(down) > 0) sc_avg_log2FC[down, "col"] = "down"

    ## Plots
    degs[[i]][[j]]$plot_scatter = ggplot(sc_avg_log2FC %>% dplyr::arrange(col, gene), aes(x=x, y=y, col=col)) + 
      geom_abline(slope=1, intercept=0, col="lightgrey") + 
      geom_abline(slope=1, intercept=c(-degs[[i]][[j]]$log2FC, degs[[i]][[j]]$log2FC), col="lightgrey", lty=2) + 
      geom_point() + 
      xlim(lims) + ylim(lims) +
      AddStyle(ylab=degs[[i]][[j]]$condition_group1, xlab=degs[[i]][[j]]$condition_group2, 
               col=c(none="grey", up="darkgoldenrod1", down="steelblue"), 
               legend_position="bottom", legend_title="Filtered genes")

    # Feature plot of top 4 genes, sorted by the p-value
    degs_top = degs[[i]][[j]]$results %>% dplyr::top_n(n=-4, wt=p_val) %>% dplyr::top_n(n=-4, wt=avg_log2FC) %>% dplyr::pull(gene)
    if (length(degs_top) > 0) {
      p_list = Seurat::FeaturePlot(sc, features=degs_top, cols=c("lightgrey", param$col), combine=FALSE, label=TRUE)
      for (p in seq(p_list)) p_list[[p]] = p_list[[p]] + AddStyle(legend_position="bottom", xlab="", ylab="")
      degs[[i]][[j]]$plot_feature = patchwork::wrap_plots(p_list, ncol=ifelse(length(degs_top) > 1, 2, 1))
    } else {
      degs[[i]][[j]]$plot_feature = NULL
    }
  }
}
knitr_header_string = '

## {{condition_column}}: {{condition_group1}} vs {{condition_group2}}

General configuration:

* assay: {{assay}}
* slot: {{slot}}
* test: {{test}}
* maximum adjusted p-value: {{padj}}
* minimum absolute log2 foldchange: {{log2FC}}
* minimum percentage of cells: {{min_pct}}
* latent vars: {{latent_vars}}

Subset on column: \'{{subset_column}}\''

if (length(degs)==0) message("No DEG contrasts specified.")

for (i in seq(degs)) {
  # Get subsets
  degs_subsets = degs[[i]]
  first_contrast = degs_subsets[[1]]
  
  # Create header
  cat(
    knitr::knit_expand(text=knitr_header_string,
                     condition_column=first_contrast[["condition_column"]],
                     condition_group1=first_contrast[["condition_group1"]],
                     condition_group2=first_contrast[["condition_group2"]],
                     assay=first_contrast[["assay"]],
                     slot=first_contrast[["slot"]],
                     test=first_contrast[["test"]],
                     padj=first_contrast[["padj"]],
                     log2FC=first_contrast[["log2FC"]],
                     min_pct=first_contrast[["min_pct"]],
                     latent_vars=ifelse(!is.null(first_contrast[["latent_vars"]]), paste(colnames(first_contrast[["latent_vars"]]), collapse=","), "-"),
                     subset_column=ifelse(is.na(first_contrast[["subset_column"]]), "-", first_contrast[["subset_column"]]))
  , '\n')
  
  # Get error messages
  error_messages = unique(purrr::flatten_chr(purrr::map(degs_subsets, function(contrast){return(contrast[["error_messages"]])})))

   # Create combined results table
  degs_subsets_results = purrr::map_dfr(degs_subsets, function(contrast) {
    subset_group_value = ifelse(!is.na(contrast[["subset_group"]]), contrast[["subset_group"]], "All")
    return(contrast[["results"]] %>% 
      dplyr::summarise(subset_group=subset_group_value,
                       Cells1=length(contrast[["cells_group1_idx"]]),
                       Cells2=length(contrast[["cells_group2_idx"]]),
                       DEGs=length(gene),
                       DEGs_up=sum(avg_log2FC > 0),
                       DEGs_down=sum(avg_log2FC < 0)))
  }) %>% tibble::as_tibble()
  
  # Print warnings/errors
  if (length(error_messages) > 0) {
    warning(error_messages)
  }
  
  # Print summary table
  print(
      knitr::kable(degs_subsets_results,
                   align="l", caption="DEG summary", 
                   col.names=c("Subset", "Cells in group 1", "Cells in group 2", "# DEGs", "# DEGs upregulated", "# DEGs downregulated"), 
                   format="html") %>%
        kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
    )
  
  # Print plots per contrast
  for (contrast in seq(degs_subsets)) {
    p = degs_subsets[[contrast]]$plot_scatter | degs_subsets[[contrast]]$plot_feature
    title = "Scatterplot and feature plots"
    if (!is.na(degs_subsets[[contrast]]$subset_column)) {
      title = paste0(title, " (subset ", degs_subsets[[contrast]]$subset_column, 
                     ": ", degs_subsets[[contrast]]$subset_group, ")")
    }
    
    p = p + patchwork::plot_annotation(title=title)
    print(p)
  }
  
  cat('\n \n')
} 

orig.ident: pbmc_10x vs pbmc_smartseq2_sample1

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0.584962500721156
  • minimum percentage of cells: 0.1
  • latent vars: -
Subset on column: ‘-’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
All 3608 310 970 327 643

orig.ident: pbmc_10x vs pbmc_smartseq2_sample1

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0.584962500721156
  • minimum percentage of cells: 0.1
  • latent vars: -
Subset on column: ‘seurat_clusters’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
1 50 50 215 131 84
2 50 35 181 117 64
3 50 42 189 124 65
4 50 50 218 137 81
5 50 24 156 90 66
6 50 43 341 192 149
7 50 19 141 90 51
8 50 8 24 0 24
9 49 16 18 6 12
10 39 6 13 0 13

Phase: G1 vs G2M

General configuration:

  • assay: RNA
  • slot: data
  • test: wilcox
  • maximum adjusted p-value: 0.05
  • minimum absolute log2 foldchange: 0.584962500721156
  • minimum percentage of cells: 0.1
  • latent vars: -
Subset on column: ‘seurat_clusters’
DEG summary
Subset Cells in group 1 Cells in group 2 # DEGs # DEGs upregulated # DEGs downregulated
1 30 30 0 0 0
2 30 30 0 0 0

Output

# Add colour lists for orig.dataset
col = GenerateColours(num_colours=length(levels(sc$orig.dataset)), names=levels(sc$orig.dataset), palette=param$col_palette_samples, alphas=1)
sc = ScAddLists(sc, lists=list(orig.dataset=col), lists_slot="colour_lists")

# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))

# Add parameter
Seurat::Misc(sc, "parameters") = param

# Add technical output
session_info = sessioninfo::session_info()

× (Message)
Registered S3 method overwritten by ‘cli’:
method from

print.boxx spatstat.geom

Seurat::Misc(sc, "technical") = list(scrnaseq_git=GitRepositoryVersion(param$path_to_git),
                                     R=session_info$platform$version,
                                     packages=as.data.frame(session_info$packages)[, c("package", "ondiskversion", "loadedversion", "date")])

Export to Loupe Cell Browser

If all provided datasets are of type “10x,” we export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (“What Is Loupe Browser? -Software -Single Cell Gene Expression -Official 10x Genomics Support” 2021).

if (all(param$path_data$type == "10x")) { 
  
  # Export reductions (umap, pca, others)
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "export", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  
  # Export categorical metadata
  loupe_meta = sc@meta.data
  idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
  write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"), 
              file=file.path(param$path_out, "export", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

  # Export gene sets (CC genes, known markers, per-cluster markers up- and downregulated, ...)
  gene_lists = Misc(sc, "gene_lists")
  loupe_genesets = purrr::map_dfr(names(gene_lists), function(n) {return(data.frame(List=n, Name=gene_lists[[n]]))})
  loupe_genesets$Ensembl = seurat_rowname_to_ensembl[loupe_genesets$Name]
  write.table(loupe_genesets, file=file.path(param$path_out, "export", "Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}

Result files are: * export/Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… visualization
Import to Loupe through “Import Projection”
* export/Loupe_metadata.csv: Seurat categorial meta data including clusters and cell cycle phases
Import to Loupe through “Import Categories”
* export/Loupe_genesets.csv: Gene sets such as markers, DEGs, cell cycles
Import to Loupe through “Import Lists”

Export to cellxgene browser

We export the assay data, cell metadata, clustering and visualisation in a format that can be read by the cellxgene browser (chanzuckerberg 2021).

# Convert Seurat single cell object to python anndata object which will be accessible via reticulate here
adata = sceasy::convertFormat(sc, from="seurat", to="anndata", outFile=NULL, assay=DefaultAssay(sc))

# Set up correct colours (see https://chanzuckerberg.github.io/cellxgene/posts/prepare) so that colours match
adata$uns = dict(
  seurat_clusters_colors=np_array(unname(Misc(sc, "colour_lists")$seurat_clusters), dtype='<U7'), 
  orig.ident_colors=np_array(unname(Misc(sc, "colour_lists")$orig.ident), dtype='<U7'),
  orig.dataset_colors=np_array(unname(Misc(sc, "colour_lists")$orig.dataset), dtype='<U7'),
  Phase_colors=np_array(unname(Misc(sc, "colour_lists")$Phase), dtype='<U7')
)

# Write to h5ad file
adata$write(file.path(param$path_out, "export", "sc.h5ad"), compression='gzip')

Result files are: * export/sc.h5ad: H5AD object for cellxgene browser
Copy/upload to data directory of your cellxgene browser

Export to the Cerebro browser

We export the assay data, clustering, visualisation, marker genes, enriched pathways and degs in a format that can be read by the Cerebro Browser (romanhaa 2021).

# Export to cerebro
res = ExportToCerebro(sc=sc, 
                      path=file.path(param$path_out, "export", "sc.crb"), 
                      assay=DefaultAssay(sc),
                      delayed_array=FALSE)

Result files are: * export/sc.crb: Object for Cerebro browser
Load into Cerebro browser

Other output files

# Seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))

# Counts (RNA)
invisible(ExportSeuratAssayData(sc, 
                      dir=file.path(param$path_out, "data", "counts"), 
                      assays="RNA", 
                      slot="counts",
                      include_cell_metadata_cols=colnames(sc[[]]),
                      metadata_prefix=paste0(param$project_id, ".")))

# Metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), row.names=FALSE, col.names=TRUE)

# Annotation as excel file
openxlsx::write.xlsx(x=data.frame(seurat_id=rownames(sc), ensembl_gene_id=seurat_rowname_to_ensembl[rownames(sc)]) %>%
                       dplyr::inner_join(annot_ensembl, by="ensembl_gene_id"),
                     file=file.path(param$path_out, "annotation", "gene_annotation.xlsx"), 
                     row.names=FALSE, col.names=TRUE)

# Data and annotation for a subset of 500 cells for Morpheus
cells_subset = ScSampleCells(sc, n=500, seed=1)
openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"), 
                     file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_normalised_data.xlsx")),
                     row.names=FALSE, col.names=TRUE)
openxlsx::write.xlsx(Seurat::GetAssayData(sc[, cells_subset], slot="scale.data") %>% as.data.frame() %>% tibble::rownames_to_column(var="gene"), 
                     file=file.path(param$path_out, "data", paste0("subset_", 500, "_cells_scaled_data.xlsx")),
                     row.names=FALSE, col.names=TRUE)
openxlsx::write.xlsx(sc[[]][cells_subset, ] %>% as.data.frame() %>% tibble::rownames_to_column(var="cell"), 
           file=file.path(param$path_out, "data",paste0("subset_", 500, "_cells_column_annotations.xlsx")),
           row.names=FALSE, col.names=TRUE)

Result files are: * data/sc.rds: Seurat object for R * data/counts: Raw counts of the entire dataset as sparse matrix * annotation/gene_annotation.xlsx: Ensembl annotation of the genes used in the Seurat object * data/subset_500_cells_normalised_data.xlsx: Normalised subset of 500 cells (for heatmap plotting with morpheus (https://software.broadinstitute.org/morpheus/)) * data/subset_500_cells_scaled_data.xlsx: Normalised and scaled subset of 500 cells (for heatmap plotting with morpheus (https://software.broadinstitute.org/morpheus/)) * data/subset_500_cells_column_annotations.xlsx.xlsx: Cell annotations for the 500 cells subset (for heatmap plotting with morpheus (https://software.broadinstitute.org/morpheus/))

Parameters and software versions

The following parameters were used to run the workflow.

out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
project_id pbmc
path_data name:pbmc_10x, pbmc_smartseq2; type:10x, smartseq2; path:test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x, test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz; stats:NA, NA; suffix:-1, -2
path_out test_datasets/10x_SmartSeq2_pbmc_GSE132044/results
file_known_markers test_datasets/10x_SmartSeq2_pbmc_GSE132044/known_markers.xlsx
mart_dataset hsapiens_gene_ensembl
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
mt ^MT-
cell_filter pbmc_10x:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20); pbmc_smartseq2_sample1:nFeature_RNA=c(200, NA), percent_mt=c(NA, 20)
feature_filter pbmc_10x:min_counts=1, min_cells=3; pbmc_smartseq2_sample1:min_counts=1, min_cells=3
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
integrate_samples method:integrate; dimensions:30; reference:; use_reciprocal_pca:FALSE
pc_n 10
cluster_resolution 0.5
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
deg_contrasts condition_column:orig.ident, orig.ident, Phase; condition_group1:pbmc_10x, pbmc_10x, G1; condition_group2:pbmc_smartseq2_sample1, pbmc_smartseq2_sample1, G2M; subset_column:NA, seurat_clusters, seurat_clusters; subset_group:NA, , 1;2; downsample_cells_n:NA, 50, 30; log2FC:0.584962500721156, 0.584962500721156, 0.584962500721156
enrichr_padj 0.05
enrichr_dbs GO_Molecular_Function_2018, GO_Biological_Process_2018, GO_Cellular_Component_2018
col palevioletred
col_palette_samples ggsci::pal_jama
col_palette_clusters ggsci::pal_igv
path_to_git .
debugging_mode default_debugging
file_annot test_datasets/10x_SmartSeq2_pbmc_GSE132044/results//annotation/hsapiens_gene_ensembl.v98.annot.txt
downsample_cells_n 0
biomart_mirror www
samples_to_drop
col_samples pbmc_10x=#374E55FF, pbmc_smartseq2_sample1=#DF8F44FF
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF, 8=#802268FF, 9=#6BD76BFF, 10=#D595A7FF

This report was created with generated using the scrnaseq GitHub repository. Software versions were collected at run time.

out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Version
ktrns/scrnaseq 40666f38c491be9ae7b39b87e44cc45ab79ad2b9
R R version 4.1.0 (2021-05-18)
Platform x86_64-w64-mingw32/x64 (64-bit)
Operating system Windows 10 x64 (build 19041)
Packages abind1.4-5, annotate1.70.0, AnnotationDbi1.54.0, ape5.5, assertthat0.2.1, babelgene21.4, beachmat2.8.0, bibtex0.4.3, Biobase2.52.0, BiocFileCache2.0.0, BiocGenerics0.38.0, BiocParallel1.26.0, BiocSingular1.8.0, biomaRt2.48.0, Biostrings2.60.0, bit4.0.4, bit644.0.5, bitops1.0-7, blob1.2.1, bslib0.2.5.1, cachem1.0.5, cerebroApp1.3.1, cli2.5.0, cluster2.1.2, codetools0.2-18, colorspace2.0-1, colourpicker1.1.0, cowplot1.1.1, crayon1.4.1, curl4.3.1, data.table1.14.0, DBI1.1.1, dbplyr2.1.1, DelayedArray0.18.0, DelayedMatrixStats1.14.0, deldir0.2-10, digest0.6.27, dplyr1.0.6, DT0.18, ellipsis0.3.2, enrichR3.0, evaluate0.14, fansi0.4.2, farver2.1.0, fastmap1.1.0, filelock1.0.2, fitdistrplus1.1-5, fs1.5.0, future1.21.0, future.apply1.7.0, generics0.1.0, GenomeInfoDb1.28.0, GenomeInfoDbData1.2.6, GenomicRanges1.44.0, ggplot23.3.3, ggrepel0.9.1, ggridges0.5.3, ggsci2.9, globals0.14.0, glue1.4.2, goftest1.2-2, graph1.70.0, gridExtra2.3, GSEABase1.54.0, GSVA1.40.0, gtable0.3.0, HDF5Array1.20.0, highr0.9, hms1.1.0, htmltools0.5.1.1, htmlwidgets1.5.3, httpuv1.6.1, httr1.4.2, ica1.0-2, igraph1.2.6, IRanges2.26.0, irlba2.3.3, jquerylib0.1.4, jsonlite1.7.2, kableExtra1.3.4, KEGGREST1.32.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.33, labeling0.4.2, later1.2.0, lattice0.20-44, lazyeval0.2.2, leiden0.3.8, lifecycle1.0.0, limma3.48.0, listenv0.8.0, lmtest0.9-38, lubridate1.7.10, magrittr2.0.1, MASS7.3-54, MAST1.18.0, Matrix1.3-3, MatrixGenerics1.4.0, matrixStats0.58.0, memoise2.0.0, mgcv1.8-35, mime0.10, miniUI0.1.1.1, msigdbr7.4.1, munsell0.5.0, nlme3.1-152, openxlsx4.2.3, parallelly1.25.0, patchwork1.1.1, pbapply1.4-3, pillar1.6.1, pkgconfig2.0.3, plotly4.9.3, plyr1.8.6, png0.1-7, polyclip1.10-0, prettyunits1.1.1, progress1.2.2, promises1.2.0.1, purrr0.3.4, qvalue2.24.0, R.methodsS31.8.1, R.oo1.24.0, R.utils2.10.1, R62.5.0, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-2, Rcpp1.0.6, RcppAnnoy0.0.18, RCurl1.98-1.3, readr1.4.0, RefManageR1.3.0, reshape21.4.4, reticulate1.20, rhdf52.36.0, rhdf5filters1.4.0, Rhdf5lib1.14.0, rjson0.2.20, rlang0.4.11, rmarkdown2.8, ROCR1.0-11, rpart4.1-15, RSpectra0.16-0, RSQLite2.2.7, rstudioapi0.13, rsvd1.0.5, Rtsne0.15, rvest1.0.0, S4Vectors0.30.0, sass0.4.0, ScaledMatrix1.0.0, scales1.1.1, scattermore0.7, sceasy0.0.6, sctransform0.3.2, sessioninfo1.1.1, Seurat4.0.2, SeuratObject4.0.1, shiny1.6.0, shinycssloaders1.0.0, shinydashboard0.7.1, shinyFiles0.9.0, shinyjs2.0.0, shinyWidgets0.6.0, SingleCellExperiment1.14.1, sparseMatrixStats1.4.0, spatstat.core2.1-2, spatstat.data2.1-0, spatstat.geom2.1-0, spatstat.sparse2.0-0, spatstat.utils2.1-0, stringi1.6.1, stringr1.4.0, SummarizedExperiment1.22.0, survival3.2-11, svglite2.0.0, systemfonts1.0.2, tensor1.5, tibble3.1.2, tidyr1.1.3, tidyselect1.1.1, utf81.2.1, uwot0.1.10, vctrs0.3.8, viridis0.6.1, viridisLite0.4.0, webshot0.5.2, withr2.4.2, xfun0.23, XML3.99-0.6, xml21.3.2, xtable1.8-4, XVector0.32.0, yaml2.2.1, zip2.2.0, zlibbioc1.38.0, zoo1.8-9

References

# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file="references.bib")
chanzuckerberg. 2021. “Chanzuckerberg/Cellxgene.” GitHub. https://github.com/chanzuckerberg/cellxgene. https://github.com/chanzuckerberg/cellxgene.
Chen, Edward Y. 2020. “Enrichr.” https://amp.pharm.mssm.edu/Enrichr/.
Gandolfo, Luke C., and Terence P. Speed. 2018. RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2): e0191629. https://doi.org/10.1371/journal.pone.0191629.
Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). https://doi.org/10.1186/s13059-019-1874-1.
romanhaa. 2021. “Romanhaa/cerebroApp.” GitHub. https://github.com/romanhaa/cerebroApp. https://github.com/romanhaa/cerebroApp.
“Satija Lab.” 2020. http://www.satijalab.org/seurat/v3.1/cell_cycle_vignette.html. https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html.
Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282): 189–96. https://doi.org/10.1126/science.aad0501.
Traag, V. A., L. Waltman, and N. J. van Eck. 2019. “From Louvain to Leiden: Guaranteeing Well-Connected Communities.” Scientific Reports 9 (1). https://doi.org/10.1038/s41598-019-41695-z.
“Understanding UMAP.” 2019. https://pair-code.github.io/understanding-umap/. https://pair-code.github.io/understanding-umap/.
“What Is Loupe Browser? -Software -Single Cell Gene Expression -Official 10x Genomics Support.” 2021. https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser.